LUCENE-3261: Facet benchmarking - indexing support - ported from 3x.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1180674 13f79535-47bb-0310-9956-ffa450edef68
Doron Cohen 2011-10-09 18:01:36 +00:00
parent 1912c6c9c6
commit 664a7191dd
21 changed files with 904 additions and 188 deletions

View File

@@ -141,6 +141,17 @@
<property name="analyzers-common.uptodate" value="true"/>
</target>
<property name="facet.jar" value="${common.dir}/../modules/facet/build/lucene-facet-${version}.jar"/>
<target name="check-facet-uptodate" unless="facet.uptodate">
<module-uptodate name="facet" jarfile="${facet.jar}" property="facet.uptodate"/>
</target>
<target name="jar-facet" unless="facet.uptodate" depends="check-facet-uptodate">
<ant dir="${common.dir}/../modules/facet" target="jar-core" inheritall="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<property name="facet.uptodate" value="true"/>
</target>
<property name="analyzers-icu.jar" value="${common.dir}/../modules/analysis/build/icu/lucene-analyzers-icu-${version}.jar"/>
<target name="check-analyzers-icu-uptodate" unless="analyzers-icu.uptodate">
<module-uptodate name="analysis/icu" jarfile="${analyzers-icu.jar}" property="analyzers-icu.uptodate"/>

View File

@@ -5,6 +5,10 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
For more information on past and future Lucene versions, please see:
http://s.apache.org/luceneversions
10/07/2011
LUCENE-3262: Facet benchmarking - Benchmark tasks and sources were added for indexing
with facets, demonstrated in facets.alg. (Gilad Barkai, Doron Cohen)
09/25/2011
LUCENE-3457: Upgrade commons-compress to 1.2 (and undo LUCENE-2980's workaround).
(Doron Cohen)

View File

@@ -153,6 +153,7 @@
<pathelement path="${highlighter.jar}"/>
<pathelement path="${analyzers-common.jar}"/>
<pathelement path="${queryparser.jar}"/>
<pathelement path="${facet.jar}"/>
<path refid="base.classpath"/>
<fileset dir="lib">
<include name="**/*.jar"/>
@@ -241,7 +242,7 @@
<echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
</target>
<target name="init" depends="contrib-build.init,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser"/>
<target name="init" depends="contrib-build.init,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
<target name="clean-javacc">
<fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">

View File

@@ -0,0 +1,72 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
with.facets=facets:true:false
content.source.forever=false
compound=true
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
taxonomy.directory=FSDirectory
doc.stored=true
doc.tokenized=true
doc.term.vector=false
log.step=1000
docs.dir=reuters-out
content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
facet.source=org.apache.lucene.benchmark.byTask.feeds.RandomFacetSource
rand.seed=10
max.doc.facets=20
max.facet.depth=3
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
task.max.depth.log=2
#log.queries=true
# -------------------------------------------------------------------------------------
{ "Rounds"
ResetSystemErase
{ "Populate"
-CreateIndex
-CreateTaxonomyIndex
{ "MAddDocs" AddFacetedDoc > : *
-Optimize
-CloseIndex
-CloseTaxonomyIndex
}
OpenReader
{ "SearchSameRdr" Search > : 40
CloseReader
#RepSumByNameRound
ResetSystemErase
NewRound
} : 4
RepSumByPrefRound Search
RepSumByPrefRound Populate
RepSumByPrefRound MAddDocs
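
As a rough sketch of how an algorithm file like the one above is executed: the benchmark module's Benchmark class takes the .alg file path as a command-line argument (the conf/facets.alg path below is illustrative, not part of this commit):

import org.apache.lucene.benchmark.byTask.Benchmark;

public class RunFacetsAlg {
  public static void main(String[] args) throws Exception {
    // parses the .alg file, runs the "Rounds" sequence, and prints the RepSum reports
    Benchmark.main(new String[] { "conf/facets.alg" }); // path is an assumption
  }
}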

View File

@@ -24,6 +24,7 @@ import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.FacetSource;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.benchmark.byTask.stats.Points;
import org.apache.lucene.benchmark.byTask.tasks.ReadTask;
@@ -31,12 +32,15 @@ import org.apache.lucene.benchmark.byTask.tasks.SearchTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.FileUtils;
import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.IOUtils;
/**
* Data maintained by a performance test run.
@@ -45,11 +49,21 @@ import org.apache.lucene.store.RAMDirectory;
* <ul>
* <li>Configuration.
* <li>Directory, Writer, Reader.
* <li>Docmaker and a few instances of QueryMaker.
* <li>Taxonomy Directory, Writer, Reader.
* <li>DocMaker, FacetSource and a few instances of QueryMaker.
* <li>Analyzer.
* <li>Statistics data which is updated during the run.
* </ul>
* Config properties: work.dir=&lt;path to root of docs and index dirs| Default: work&gt;
* Config properties:
* <ul>
* <li><b>work.dir</b>=&lt;path to root of docs and index dirs| Default: work&gt;
* <li><b>analyzer</b>=&lt;class name for analyzer| Default: StandardAnalyzer&gt;
* <li><b>doc.maker</b>=&lt;class name for doc-maker| Default: DocMaker&gt;
* <li><b>facet.source</b>=&lt;class name for facet-source| Default: RandomFacetSource&gt;
* <li><b>query.maker</b>=&lt;class name for query-maker| Default: SimpleQueryMaker&gt;
* <li><b>log.queries</b>=&lt;whether queries should be printed| Default: false&gt;
* <li><b>directory</b>=&lt;type of directory to use for the index| Default: RAMDirectory&gt;
* <li><b>taxonomy.directory</b>=&lt;type of directory for taxonomy index| Default: RAMDirectory&gt;
* </ul>
*/
public class PerfRunData {
@@ -62,7 +76,12 @@ public class PerfRunData {
private Directory directory;
private Analyzer analyzer;
private DocMaker docMaker;
private FacetSource facetSource;
private Locale locale;
private Directory taxonomyDir;
private TaxonomyWriter taxonomyWriter;
private TaxonomyReader taxonomyReader;
// we use separate (identical) instances for each "read" task type, so each can iterate the queries separately.
private HashMap<Class<? extends ReadTask>,QueryMaker> readTaskQueryMaker;
@@ -73,6 +92,7 @@ public class PerfRunData {
private IndexWriter indexWriter;
private Config config;
private long startTimeMillis;
// constructor
public PerfRunData (Config config) throws Exception {
@@ -84,6 +104,10 @@
docMaker = Class.forName(config.get("doc.maker",
"org.apache.lucene.benchmark.byTask.feeds.DocMaker")).asSubclass(DocMaker.class).newInstance();
docMaker.setConfig(config);
// facet source
facetSource = Class.forName(config.get("facet.source",
"org.apache.lucene.benchmark.byTask.feeds.RandomFacetSource")).asSubclass(FacetSource.class).newInstance();
facetSource.setConfig(config);
// query makers
readTaskQueryMaker = new HashMap<Class<? extends ReadTask>,QueryMaker>();
qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker")).asSubclass(QueryMaker.class);
@@ -104,30 +128,17 @@
public void reinit(boolean eraseIndex) throws Exception {
// cleanup index
if (indexWriter!=null) {
indexWriter.close();
indexWriter = null;
}
if (indexReader!=null) {
indexReader.close();
indexReader = null;
}
if (directory!=null) {
directory.close();
}
IOUtils.close(indexWriter, indexReader, directory);
indexWriter = null;
indexReader = null;
IOUtils.close(taxonomyWriter, taxonomyReader, taxonomyDir);
taxonomyWriter = null;
taxonomyReader = null;
// directory (default is ram-dir).
if ("FSDirectory".equals(config.get("directory","RAMDirectory"))) {
File workDir = new File(config.get("work.dir","work"));
File indexDir = new File(workDir,"index");
if (eraseIndex && indexDir.exists()) {
FileUtils.fullyDelete(indexDir);
}
indexDir.mkdirs();
directory = FSDirectory.open(indexDir);
} else {
directory = new RAMDirectory();
}
directory = createDirectory(eraseIndex, "index", "directory");
taxonomyDir = createDirectory(eraseIndex, "taxo", "taxonomy.directory");
// inputs
resetInputs();
@@ -139,6 +150,21 @@
// Re-init clock
setStartTimeMillis();
}
private Directory createDirectory(boolean eraseIndex, String dirName,
String dirParam) throws IOException {
if ("FSDirectory".equals(config.get(dirParam,"RAMDirectory"))) {
File workDir = new File(config.get("work.dir","work"));
File indexDir = new File(workDir,dirName);
if (eraseIndex && indexDir.exists()) {
FileUtils.fullyDelete(indexDir);
}
indexDir.mkdirs();
return FSDirectory.open(indexDir);
}
return new RAMDirectory();
}
public long setStartTimeMillis() {
startTimeMillis = System.currentTimeMillis();
@@ -173,6 +199,57 @@
this.directory = directory;
}
/**
* @return Returns the taxonomy directory
*/
public Directory getTaxonomyDir() {
return taxonomyDir;
}
/**
* Set the taxonomy reader. Takes ownership of that taxonomy reader, that is,
* internally performs taxoReader.incRef() (If caller no longer needs that
* reader it should decRef()/close() it after calling this method, otherwise,
* the reader will remain open).
* @param taxoReader The taxonomy reader to set.
*/
public synchronized void setTaxonomyReader(TaxonomyReader taxoReader) throws IOException {
if (taxoReader == this.taxonomyReader) {
return;
}
if (taxonomyReader != null) {
taxonomyReader.decRef();
}
if (taxoReader != null) {
taxoReader.incRef();
}
this.taxonomyReader = taxoReader;
}
/**
* @return Returns the taxonomyReader. NOTE: this returns a
* reference. You must call TaxonomyReader.decRef() when
* you're done.
*/
public synchronized TaxonomyReader getTaxonomyReader() {
if (taxonomyReader != null) {
taxonomyReader.incRef();
}
return taxonomyReader;
}
/**
* @param taxoWriter The taxonomy writer to set.
*/
public void setTaxonomyWriter(TaxonomyWriter taxoWriter) {
this.taxonomyWriter = taxoWriter;
}
public TaxonomyWriter getTaxonomyWriter() {
return taxonomyWriter;
}
/**
* @return Returns the indexReader. NOTE: this returns a
* reference. You must call IndexReader.decRef() when
@@ -198,13 +275,22 @@
}
/**
* Set the index reader. Takes ownership of that index reader, that is,
* internally performs indexReader.incRef() (If caller no longer needs that
* reader it should decRef()/close() it after calling this method, otherwise,
* the reader will remain open).
* @param indexReader The indexReader to set.
*/
public synchronized void setIndexReader(IndexReader indexReader) throws IOException {
if (indexReader == this.indexReader) {
return;
}
if (this.indexReader != null) {
// Release current IR
this.indexReader.decRef();
}
this.indexReader = indexReader;
if (indexReader != null) {
// Hold reference to new IR
@@ -246,6 +332,11 @@
return docMaker;
}
/** Returns the facet source. */
public FacetSource getFacetSource() {
return facetSource;
}
/**
* @return the locale
*/
@@ -269,6 +360,7 @@
public void resetInputs() throws IOException {
docMaker.resetInputs();
facetSource.resetInputs();
for (final QueryMaker queryMaker : readTaskQueryMaker.values()) {
queryMaker.resetInputs();
}
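
Because getTaxonomyReader() (like getIndexReader()) hands back an incRef'd reference, every caller must balance the acquire with decRef(). A minimal usage sketch, assuming runData is an already-initialized PerfRunData (initialization not shown):

TaxonomyReader taxoReader = runData.getTaxonomyReader(); // incRef'd on our behalf
try {
  if (taxoReader != null) {
    // use the reader only while holding the reference
    System.out.println("taxonomy size: " + taxoReader.getSize());
  }
} finally {
  if (taxoReader != null) {
    taxoReader.decRef(); // the reader is closed once its refCount drops to zero
  }
}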

View File

@@ -0,0 +1,180 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
/**
* Base class for source of data for benchmarking
* <p>
* Keeps track of various statistics, such as how many data items were generated,
* their size in bytes, etc.
* <p>
* Supports the following configuration parameters:
* <ul>
* <li><b>content.source.forever</b> - specifies whether to generate items
* forever (<b>default=true</b>).
* <li><b>content.source.verbose</b> - specifies whether messages should be
* output by the content source (<b>default=false</b>).
* <li><b>content.source.encoding</b> - specifies which encoding to use when
* reading the files of that content source. Certain implementations may define
* a default value if this parameter is not specified. (<b>default=null</b>).
* <li><b>content.source.log.step</b> - specifies how many items to generate
* between log messages. If set to 0, no logging occurs.
* <b>NOTE:</b> if verbose is set to false, no logging occurs even if
* logStep is not 0 (<b>default=0</b>).
* </ul>
*/
public abstract class ContentItemsSource {
private long bytesCount;
private long totalBytesCount;
private int itemCount;
private int totalItemCount;
private Config config;
private int lastPrintedNumUniqueTexts = 0;
private long lastPrintedNumUniqueBytes = 0;
private int printNum = 0;
protected boolean forever;
protected int logStep;
protected boolean verbose;
protected String encoding;
/** update count of bytes generated by this source */
protected final synchronized void addBytes(long numBytes) {
bytesCount += numBytes;
totalBytesCount += numBytes;
}
/** update count of items generated by this source */
protected final synchronized void addItem() {
++itemCount;
++totalItemCount;
}
/**
* A convenience method for collecting all the files of a content source from
* a given directory. The collected {@link File} instances are stored in the
* given <code>files</code>.
*/
protected final void collectFiles(File dir, ArrayList<File> files) {
if (!dir.canRead()) {
return;
}
File[] dirFiles = dir.listFiles();
Arrays.sort(dirFiles);
for (int i = 0; i < dirFiles.length; i++) {
File file = dirFiles[i];
if (file.isDirectory()) {
collectFiles(file, files);
} else if (file.canRead()) {
files.add(file);
}
}
}
/**
* Returns true if it's time to log a message (depending on verbose and
* the number of items generated).
*/
protected final boolean shouldLog() {
return verbose && logStep > 0 && itemCount % logStep == 0;
}
/** Called when reading from this content source is no longer required. */
public abstract void close() throws IOException;
/** Returns the number of bytes generated since last reset. */
public final long getBytesCount() { return bytesCount; }
/** Returns the number of generated items since last reset. */
public final int getItemsCount() { return itemCount; }
public final Config getConfig() { return config; }
/** Returns the total number of bytes that were generated by this source. */
public final long getTotalBytesCount() { return totalBytesCount; }
/** Returns the total number of generated items. */
public final int getTotalItemsCount() { return totalItemCount; }
/**
* Resets the input for this content source, so that the test would behave as
* if it was just started, input-wise.
* <p>
* <b>NOTE:</b> the default implementation resets the number of bytes and
* items generated since the last reset, so it's important to call
* super.resetInputs in case you override this method.
*/
@SuppressWarnings("unused")
public void resetInputs() throws IOException {
bytesCount = 0;
itemCount = 0;
}
/**
* Sets the {@link Config} for this content source. If you override this
* method, you must call super.setConfig.
*/
public void setConfig(Config config) {
this.config = config;
forever = config.get("content.source.forever", true);
logStep = config.get("content.source.log.step", 0);
verbose = config.get("content.source.verbose", false);
encoding = config.get("content.source.encoding", null);
}
public void printStatistics(String itemsName) {
boolean print = false;
String col = " ";
StringBuilder sb = new StringBuilder();
String newline = System.getProperty("line.separator");
sb.append("------------> ").append(getClass().getSimpleName()).append(" statistics (").append(printNum).append("): ").append(newline);
int nut = getTotalItemsCount();
if (nut > lastPrintedNumUniqueTexts) {
print = true;
sb.append("total count of "+itemsName+": ").append(Format.format(0,nut,col)).append(newline);
lastPrintedNumUniqueTexts = nut;
}
long nub = getTotalBytesCount();
if (nub > lastPrintedNumUniqueBytes) {
print = true;
sb.append("total bytes of "+itemsName+": ").append(Format.format(0,nub,col)).append(newline);
lastPrintedNumUniqueBytes = nub;
}
if (getItemsCount() > 0) {
print = true;
sb.append("num "+itemsName+" added since last inputs reset: ").append(Format.format(0,getItemsCount(),col)).append(newline);
sb.append("total bytes added for "+itemsName+" since last inputs reset: ").append(Format.format(0,getBytesCount(),col)).append(newline);
}
if (print) {
System.out.println(sb.append(newline).toString());
printNum++;
}
}
}
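
A minimal sketch of a hypothetical subclass (not part of this commit), showing how addItem()/addBytes() feed the statistics that printStatistics() reports and how shouldLog() gates progress output:

public class CountingItemsSource extends ContentItemsSource {
  private int next = 0;

  public synchronized String getNextItem() {
    String item = "item-" + next++;
    addItem();               // bumps the per-reset and total item counters
    addBytes(item.length()); // rough byte accounting, as RandomFacetSource does
    if (shouldLog()) {
      System.out.println("generated " + getItemsCount() + " items");
    }
    return item;
  }

  @Override
  public void close() { /* no underlying resources to release */ }
}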

View File

@@ -17,12 +17,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.lucene.benchmark.byTask.utils.Config;
/**
* Represents content from a specified source, such as TREC, Reuters etc. A
@@ -31,119 +26,13 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
* of various statistics, such as how many documents were generated, size in
* bytes etc.
* <p>
* Supports the following configuration parameters:
* <ul>
* <li><b>content.source.forever</b> - specifies whether to generate documents
* forever (<b>default=true</b>).
* <li><b>content.source.verbose</b> - specifies whether messages should be
* output by the content source (<b>default=false</b>).
* <li><b>content.source.encoding</b> - specifies which encoding to use when
* reading the files of that content source. Certain implementations may define
* a default value if this parameter is not specified. (<b>default=null</b>).
* <li><b>content.source.log.step</b> - specifies for how many documents a
* message should be logged. If set to 0 it means no logging should occur.
* <b>NOTE:</b> if verbose is set to false, logging should not occur even if
* logStep is not 0 (<b>default=0</b>).
* </ul>
* For supported configuration parameters see {@link ContentItemsSource}.
*/
public abstract class ContentSource {
public abstract class ContentSource extends ContentItemsSource {
private long bytesCount;
private long totalBytesCount;
private int docsCount;
private int totalDocsCount;
private Config config;
protected boolean forever;
protected int logStep;
protected boolean verbose;
protected String encoding;
/** update count of bytes generated by this source */
protected final synchronized void addBytes(long numBytes) {
bytesCount += numBytes;
totalBytesCount += numBytes;
}
/** update count of documents generated by this source */
protected final synchronized void addDoc() {
++docsCount;
++totalDocsCount;
}
/**
* A convenience method for collecting all the files of a content source from
* a given directory. The collected {@link File} instances are stored in the
* given <code>files</code>.
*/
protected final void collectFiles(File dir, ArrayList<File> files) {
if (!dir.canRead()) {
return;
}
File[] dirFiles = dir.listFiles();
Arrays.sort(dirFiles);
for (int i = 0; i < dirFiles.length; i++) {
File file = dirFiles[i];
if (file.isDirectory()) {
collectFiles(file, files);
} else if (file.canRead()) {
files.add(file);
}
}
}
/**
* Returns true whether it's time to log a message (depending on verbose and
* the number of documents generated).
*/
protected final boolean shouldLog() {
return verbose && logStep > 0 && docsCount % logStep == 0;
}
/** Called when reading from this content source is no longer required. */
public abstract void close() throws IOException;
/** Returns the number of bytes generated since last reset. */
public final long getBytesCount() { return bytesCount; }
/** Returns the number of generated documents since last reset. */
public final int getDocsCount() { return docsCount; }
public final Config getConfig() { return config; }
/** Returns the next {@link DocData} from the content source. */
/** Returns the next {@link DocData} from the content source.
* Implementations must account for multi-threading, as multiple threads
* can call this method simultaneously. */
public abstract DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException;
/** Returns the total number of bytes that were generated by this source. */
public final long getTotalBytesCount() { return totalBytesCount; }
/** Returns the total number of generated documents. */
public final int getTotalDocsCount() { return totalDocsCount; }
/**
* Resets the input for this content source, so that the test would behave as
* if it was just started, input-wise.
* <p>
* <b>NOTE:</b> the default implementation resets the number of bytes and
* documents generated since the last reset, so it's important to call
* super.resetInputs in case you override this method.
*/
public void resetInputs() throws IOException {
bytesCount = 0;
docsCount = 0;
}
/**
* Sets the {@link Config} for this content source. If you override this
* method, you must call super.setConfig.
*/
public void setConfig(Config config) {
this.config = config;
forever = config.get("content.source.forever", true);
logStep = config.get("content.source.log.step", 0);
verbose = config.get("content.source.verbose", false);
encoding = config.get("content.source.encoding", null);
}
}

View File

@@ -31,7 +31,6 @@ import java.text.SimpleDateFormat;
import java.text.ParsePosition;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@@ -186,13 +185,8 @@ public class DocMaker {
protected boolean reuseFields;
protected boolean indexProperties;
private int lastPrintedNumUniqueTexts = 0;
private long lastPrintedNumUniqueBytes = 0;
private final AtomicInteger numDocsCreated = new AtomicInteger();
private int printNum = 0;
public DocMaker() {
}
@@ -400,38 +394,9 @@
return doc;
}
public void printDocStatistics() {
boolean print = false;
String col = " ";
StringBuilder sb = new StringBuilder();
String newline = System.getProperty("line.separator");
sb.append("------------> ").append(getClass().getSimpleName()).append(" statistics (").append(printNum).append("): ").append(newline);
int nut = source.getTotalDocsCount();
if (nut > lastPrintedNumUniqueTexts) {
print = true;
sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
lastPrintedNumUniqueTexts = nut;
}
long nub = getTotalBytesCount();
if (nub > lastPrintedNumUniqueBytes) {
print = true;
sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
lastPrintedNumUniqueBytes = nub;
}
if (source.getDocsCount() > 0) {
print = true;
sb.append("num docs added since last inputs reset: ").append(Format.format(0,source.getDocsCount(),col)).append(newline);
sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getBytesCount(),col)).append(newline);
}
if (print) {
System.out.println(sb.append(newline).toString());
printNum++;
}
}
/** Reset inputs so that the test run would behave, input wise, as if it just started. */
public synchronized void resetInputs() throws IOException {
printDocStatistics();
source.printStatistics("docs");
// re-initiate since properties by round may have changed.
setConfig(config);
source.resetInputs();

View File

@@ -0,0 +1,45 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.facet.index.CategoryContainer;
/**
* Source items for facets.
* <p>
* For supported configuration parameters see {@link ContentItemsSource}.
*/
public abstract class FacetSource extends ContentItemsSource {
/** Returns the next {@link CategoryContainer facets content item}.
* Implementations must account for multi-threading, as multiple threads
* can call this method simultaneously.
*/
public abstract CategoryContainer getNextFacets(CategoryContainer facets) throws NoMoreDataException, IOException;
@Override
public void resetInputs() throws IOException {
printStatistics("facets");
// re-initiate since properties by round may have changed.
setConfig(getConfig());
super.resetInputs();
}
}

View File

@@ -0,0 +1,81 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.facet.index.CategoryContainer;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Simple implementation of a random facet source.
* <p>
* Supports the following parameters:
* <ul>
* <li><b>rand.seed</b> - defines the seed to initialize Random with (default: <b>13</b>).
* <li><b>max.doc.facets</b> - maximal #facets per doc (default: <b>10</b>).
* The actual number of facets for a given doc is between 1 and that number.
* <li><b>max.facet.depth</b> - maximal #components in a facet (default: <b>3</b>).
* The actual number of components in a given facet is between 1 and that number.
* </ul>
*/
public class RandomFacetSource extends FacetSource {
Random random;
private int maxDocFacets = 10;
private int maxFacetDepth = 3;
private int maxValue = maxDocFacets * maxFacetDepth;
@Override
public CategoryContainer getNextFacets(CategoryContainer facets) throws NoMoreDataException, IOException {
if (facets == null) {
facets = new CategoryContainer();
} else {
facets.clear();
}
int numFacets = 1 + random.nextInt(maxDocFacets); // 1..maxDocFacets - at least one facet per doc
for (int i=0; i<numFacets; i++) {
CategoryPath cp = new CategoryPath();
int depth = 1 + random.nextInt(maxFacetDepth); // 1..maxFacetDepth - depth 0 is not useful
for (int k=0; k<depth; k++) {
cp.add(Integer.toString(random.nextInt(maxValue)));
addItem();
}
facets.addCategory(cp);
addBytes(cp.toString().length()); // very rough approximation
}
return facets;
}
@Override
public void close() throws IOException {
// nothing to do here
}
@Override
public void setConfig(Config config) {
super.setConfig(config);
random = new Random(config.get("rand.seed", 13));
maxDocFacets = config.get("max.doc.facets", 10); // match the documented default
maxFacetDepth = config.get("max.facet.depth", 3); // match the documented default
maxValue = maxDocFacets * maxFacetDepth;
}
}
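
A sketch of exercising RandomFacetSource outside a full benchmark run, assuming Config exposes a Properties-based constructor (values mirror facets.alg above; run inside a method declared throws Exception):

Properties props = new Properties();
props.setProperty("rand.seed", "10");
props.setProperty("max.doc.facets", "20");
props.setProperty("max.facet.depth", "3");

FacetSource source = new RandomFacetSource();
source.setConfig(new Config(props)); // seeds Random and reads the max.* limits

CategoryContainer facets = source.getNextFacets(null); // null asks for a fresh container
// each generated path component counts as one item
System.out.println("components generated: " + source.getItemsCount());
source.close();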

View File

@@ -289,7 +289,7 @@ public class TrecContentSource extends ContentSource {
// here, everything else is already private to that thread, so we're safe.
try {
docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
addDoc();
addItem();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
}

View File

@@ -17,12 +17,14 @@ package org.apache.lucene.benchmark.byTask.tasks;
* limitations under the License.
*/
import java.text.NumberFormat;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.document.Document;
/**
* Add a document, optionally with of a certain size.
* Add a document, optionally of a certain size.
* <br>Other side effects: none.
* <br>Takes optional param: document size.
*/
@@ -34,9 +36,12 @@ public class AddDocTask extends PerfTask {
private int docSize = 0;
// volatile data passed between setup(), doLogic(), tearDown().
private Document doc = null;
/**
* volatile data passed between setup(), doLogic(), tearDown().
* the doc is created at setup() and added at doLogic().
*/
protected Document doc = null;
@Override
public void setup() throws Exception {
super.setup();
@@ -56,7 +61,7 @@
@Override
protected String getLogMessage(int recsCount) {
return "added " + recsCount + " docs";
return String.format("added %9d docs",recsCount);
}
@Override

View File

@@ -0,0 +1,77 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.FacetSource;
import org.apache.lucene.facet.index.CategoryContainer;
import org.apache.lucene.facet.index.CategoryDocumentBuilder;
/**
* Add a faceted document.
* <p>
* Config properties:
* <ul>
* <li><b>with.facets</b>=&lt;tells whether to actually add any facets to the document| Default: true&gt;
* <br>This config property makes it easy to compare the performance of adding docs with and without facets.
* Note that facets are created even when this is false; they are just not added to the document (nor to the taxonomy).
* </ul>
* <p>
* See {@link AddDocTask} for general document parameters and configuration.
* <p>
* Makes use of the {@link FacetSource} in effect - see {@link PerfRunData} for facet source settings.
*/
public class AddFacetedDocTask extends AddDocTask {
public AddFacetedDocTask(PerfRunData runData) {
super(runData);
}
private CategoryContainer facets = null;
private CategoryDocumentBuilder categoryDocBuilder = null;
private boolean withFacets = true;
@Override
public void setup() throws Exception {
super.setup();
// create the facets even if they will not be added - this allows measuring the cost of just adding facets
facets = getRunData().getFacetSource().getNextFacets(facets);
withFacets = getRunData().getConfig().get("with.facets", true);
if (withFacets) {
categoryDocBuilder = new CategoryDocumentBuilder(getRunData().getTaxonomyWriter());
categoryDocBuilder.setCategories(facets);
}
}
@Override
protected String getLogMessage(int recsCount) {
if (!withFacets) {
return super.getLogMessage(recsCount);
}
return super.getLogMessage(recsCount)+ " with facets";
}
@Override
public int doLogic() throws Exception {
if (withFacets) {
categoryDocBuilder.build(doc);
}
return super.doLogic();
}
}

View File

@@ -0,0 +1,43 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.util.IOUtils;
/**
* Close taxonomy index.
* <br>Other side effects: taxonomy writer object in perfRunData is nullified.
*/
public class CloseTaxonomyIndexTask extends PerfTask {
public CloseTaxonomyIndexTask(PerfRunData runData) {
super(runData);
}
@Override
public int doLogic() throws IOException {
IOUtils.close(getRunData().getTaxonomyWriter());
getRunData().setTaxonomyWriter(null);
return 1;
}
}

View File

@@ -0,0 +1,46 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/**
* Close taxonomy reader.
* <br>Other side effects: taxonomy reader in perfRunData is nullified.
*/
public class CloseTaxonomyReaderTask extends PerfTask {
public CloseTaxonomyReaderTask(PerfRunData runData) {
super(runData);
}
@Override
public int doLogic() throws IOException {
TaxonomyReader taxoReader = getRunData().getTaxonomyReader();
getRunData().setTaxonomyReader(null);
if (taxoReader.getRefCount() != 1) {
System.out.println("WARNING: CloseTaxonomyReader: reference count is currently " + taxoReader.getRefCount());
}
taxoReader.close();
return 1;
}
}

View File

@@ -0,0 +1,41 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
/**
* Commits the Taxonomy Index.
*/
public class CommitTaxonomyIndexTask extends PerfTask {
public CommitTaxonomyIndexTask(PerfRunData runData) {
super(runData);
}
@Override
public int doLogic() throws Exception {
TaxonomyWriter taxonomyWriter = getRunData().getTaxonomyWriter();
if (taxonomyWriter != null) {
taxonomyWriter.commit();
} else {
throw new IllegalStateException("TaxonomyWriter is not currently open");
}
return 1;
}
}

View File

@@ -0,0 +1,44 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import java.io.IOException;
/**
* Create a taxonomy index.
* <br>Other side effects: taxonomy writer object in perfRunData is set.
*/
public class CreateTaxonomyIndexTask extends PerfTask {
public CreateTaxonomyIndexTask(PerfRunData runData) {
super(runData);
}
@Override
public int doLogic() throws IOException {
PerfRunData runData = getRunData();
runData.setTaxonomyWriter(new LuceneTaxonomyWriter(runData.getTaxonomyDir(), OpenMode.CREATE));
return 1;
}
}

View File

@@ -0,0 +1,42 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;
import java.io.IOException;
/**
* Open a taxonomy index.
* <br>Other side effects: taxonomy writer object in perfRunData is set.
*/
public class OpenTaxonomyIndexTask extends PerfTask {
public OpenTaxonomyIndexTask(PerfRunData runData) {
super(runData);
}
@Override
public int doLogic() throws IOException {
PerfRunData runData = getRunData();
runData.setTaxonomyWriter(new LuceneTaxonomyWriter(runData.getTaxonomyDir()));
return 1;
}
}

View File

@@ -0,0 +1,45 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader;
/**
* Open a taxonomy index reader.
* <br>Other side effects: taxonomy reader object in perfRunData is set.
*/
public class OpenTaxonomyReaderTask extends PerfTask {
public OpenTaxonomyReaderTask(PerfRunData runData) {
super(runData);
}
@Override
public int doLogic() throws IOException {
PerfRunData runData = getRunData();
LuceneTaxonomyReader taxoReader = new LuceneTaxonomyReader(runData.getTaxonomyDir());
runData.setTaxonomyReader(taxoReader);
// setTaxonomyReader() incRef'd the reader, transferring ownership to the run data, so release our own reference
taxoReader.decRef();
return 1;
}
}

View File

@@ -17,8 +17,6 @@ package org.apache.lucene.benchmark.byTask.tasks;
* limitations under the License.
*/
import java.text.NumberFormat;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.stats.Points;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
@@ -270,9 +268,7 @@ public abstract class PerfTask implements Cloneable {
public void tearDown() throws Exception {
if (++logStepCount % logStep == 0) {
double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0;
NumberFormat nf = NumberFormat.getInstance();
nf.setMaximumFractionDigits(2);
System.out.println(nf.format(time) + " sec --> "
System.out.println(String.format("%7.2f",time) + " sec --> "
+ Thread.currentThread().getName() + " " + getLogMessage(logStepCount));
}
}

View File

@@ -40,6 +40,7 @@ import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
@@ -780,6 +781,42 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
reader.close();
}
/**
* Test indexing with facets tasks.
*/
public void testIndexingWithFacets() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=100",
"content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"merge.factor=3",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"ResetSystemErase",
"CreateIndex",
"CreateTaxonomyIndex",
"{ \"AddDocs\" AddFacetedDoc > : * ",
"CloseIndex",
"CloseTaxonomyIndex",
"OpenTaxonomyReader",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
PerfRunData runData = benchmark.getRunData();
assertNull("taxo writer was not properly closed",runData.getTaxonomyWriter());
TaxonomyReader taxoReader = runData.getTaxonomyReader();
assertNotNull("taxo reader was not opened", taxoReader);
assertTrue("nothing was added to the taxnomy (expecting root and at least one addtional category)",taxoReader.getSize()>1);
taxoReader.close();
}
/**
* Test that we can call optimize(maxNumSegments).
*/