LUCENE-6224: cut over more package.htmls

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1658399 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2015-02-09 13:56:55 +00:00
parent e0ff51383c
commit ac5e4dd337
41 changed files with 1226 additions and 1276 deletions


@@ -0,0 +1,29 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This package provides a dictionary-driven lemmatization ("accurate stemming")
* filter and analyzer for the Polish language, driven by the
* <a href="http://morfologik.blogspot.com/">Morfologik library</a> developed
* by Dawid Weiss and Marcin Miłkowski.
* <p>
* The MorfologikFilter yields one or more terms for each token. Each
* of those terms is given the same position in the index.
* </p>
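* <p>
* A minimal usage sketch (assuming the analyzer's bundled default Polish
* dictionary; the input word and its lemmas are illustrative):
* </p>
* <pre>
* Analyzer analyzer = new MorfologikAnalyzer();
* try (TokenStream ts = analyzer.tokenStream("field", "zamkami")) {
*   CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
*   ts.reset();
*   while (ts.incrementToken()) {
*     System.out.println(term);   // one or more lemmas, all at the same position
*   }
*   ts.end();
* }
* </pre>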
*/
package org.apache.lucene.analysis.morfologik;


@@ -1,34 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
</head>
<body>
<p>
This package provides dictionary-driven lemmatization ("accurate stemming")
filter and analyzer for the Polish Language, driven by the
<a href="http://morfologik.blogspot.com/">Morfologik library</a> developed
by Dawid Weiss and Marcin Miłkowski.
</p>
<p>
The MorfologikFilter yields one or more terms for each token. Each
of those terms is given the same position in the index.
</p>
</body>
</html>


@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Analysis components for phonetic search.
*/
package org.apache.lucene.analysis.phonetic;


@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analysis components for phonetic search.
</body>
</html>


@@ -0,0 +1,22 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* SmartChineseAnalyzer Hidden Markov Model package.
* @lucene.experimental
*/
package org.apache.lucene.analysis.cn.smart.hhmm;


@@ -1,25 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
SmartChineseAnalyzer Hidden Markov Model package.
@lucene.experimental
</body>
</html>


@@ -0,0 +1,37 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Analyzer for Simplified Chinese, which indexes words.
* @lucene.experimental
* <div>
* Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
* <ul>
* <li>StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
* <li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
* <li>SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
* </ul>
*
* Example phrase "我是中国人"
* <ol>
* <li>StandardAnalyzer: 我-是-中-国-人</li>
* <li>CJKAnalyzer: 我是-是中-中国-国人</li>
* <li>SmartChineseAnalyzer: 我-是-中国-人</li>
* </ol>
* </div>
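* <p>
* A minimal segmentation sketch (assuming the analyzer's bundled default
* dictionaries; the tokens produced match the example above):
* </p>
* <pre>
* Analyzer analyzer = new SmartChineseAnalyzer();
* try (TokenStream ts = analyzer.tokenStream("field", "我是中国人")) {
*   CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
*   ts.reset();
*   while (ts.incrementToken()) {
*     System.out.println(term);   // 我, 是, 中国, 人
*   }
*   ts.end();
* }
* </pre>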
*/
package org.apache.lucene.analysis.cn.smart;


@@ -1,42 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
Analyzer for Simplified Chinese, which indexes words.
@lucene.experimental
<div>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
<li>StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
<li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
<li>SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
</ul>
Example phrase "我是中国人"
<ol>
<li>StandardAnalyzer: 我-是-中-国-人</li>
<li>CJKAnalyzer: 我是-是中-中国-国人</li>
<li>SmartChineseAnalyzer: 我-是-中国-人</li>
</ol>
</div>
</body>
</html>


@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Analyzer for Polish.
*/
package org.apache.lucene.analysis.pl;


@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Polish.
</body>
</html>


@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Stempel: Algorithmic Stemmer
*/
package org.apache.lucene.analysis.stempel;


@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
<p>Stempel: Algorithmic Stemmer</p>
</body>
</html>


@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Egothor stemmer API.
*/
package org.egothor.stemmer;


@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Egothor stemmer API.
</body>
</html>


@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Integration with UIMA's AnalysisEngine.
*/
package org.apache.lucene.analysis.uima.ae;


@@ -1,21 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Integration with UIMA's AnalysisEngine
</body>
</html>


@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Classes that integrate UIMA with Lucene's analysis API.
*/
package org.apache.lucene.analysis.uima;


@@ -1,21 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Classes that integrate UIMA with Lucene's analysis API.
</body>
</html>


@@ -15,6 +15,7 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- not a package-info.java, because we already defined this package in core/ -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">


@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Sources for benchmark inputs: documents and queries.
*/
package org.apache.lucene.benchmark.byTask.feeds;


@@ -1,23 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Sources for benchmark inputs: documents and queries.
</body>
</html>


@@ -0,0 +1,719 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Benchmarking Lucene By Tasks
* <p>
* This package provides "task based" performance benchmarking of Lucene.
* One can use the predefined benchmarks, or create new ones.
* </p>
* <p>
* Contained packages:
* </p>
*
* <table border=1 cellpadding=4 summary="table of benchmark packages">
* <tr>
* <td><b>Package</b></td>
* <td><b>Description</b></td>
* </tr>
* <tr>
* <td><a href="stats/package-summary.html">stats</a></td>
* <td>Statistics maintained when running benchmark tasks.</td>
* </tr>
* <tr>
* <td><a href="tasks/package-summary.html">tasks</a></td>
* <td>Benchmark tasks.</td>
* </tr>
* <tr>
* <td><a href="feeds/package-summary.html">feeds</a></td>
* <td>Sources for benchmark inputs: documents and queries.</td>
* </tr>
* <tr>
* <td><a href="utils/package-summary.html">utils</a></td>
* <td>Utilities used for the benchmark, and for the reports.</td>
* </tr>
* <tr>
* <td><a href="programmatic/package-summary.html">programmatic</a></td>
* <td>Sample performance test written programmatically.</td>
* </tr>
* </table>
*
* <h2>Table Of Contents</h2>
* <ol>
* <li><a href="#concept">Benchmarking By Tasks</a></li>
* <li><a href="#usage">How to use</a></li>
* <li><a href="#algorithm">Benchmark "algorithm"</a></li>
* <li><a href="#tasks">Supported tasks/commands</a></li>
* <li><a href="#properties">Benchmark properties</a></li>
* <li><a href="#example">Example input algorithm and the result benchmark
* report.</a></li>
* <li><a href="#recsCounting">Results record counting clarified</a></li>
* </ol>
* <a name="concept"></a>
* <h2>Benchmarking By Tasks</h2>
* <p>
* Benchmark Lucene using task primitives.
* </p>
*
* <p>
* A benchmark is composed of some predefined tasks, allowing for creating an
* index, adding documents,
* optimizing, searching, generating reports, and more. A benchmark run takes an
* "algorithm" file
* that contains a description of the sequence of tasks making up the run, and some
* properties defining a few
* additional characteristics of the benchmark run.
* </p>
*
* <a name="usage"></a>
* <h2>How to use</h2>
* <p>
* The easiest way to run a benchmark is with the predefined ant task:
* <ul>
* <li>ant run-task
* <br>- would run the <code>micro-standard.alg</code> "algorithm".
* </li>
* <li>ant run-task -Dtask.alg=conf/compound-penalty.alg
* <br>- would run the <code>compound-penalty.alg</code> "algorithm".
* </li>
* <li>ant run-task -Dtask.alg=[full-path-to-your-alg-file]
* <br>- would run <code>your perf test</code> "algorithm".
* </li>
* <li>java org.apache.lucene.benchmark.byTask.programmatic.Sample
* <br>- would run a performance test programmatically - without using an alg
* file. This is less readable, and less convenient, but possible.
* </li>
* </ul>
*
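* <p>
* For the programmatic route, a minimal sketch (error handling omitted;
* assumes an existing <code>conf/sample.alg</code> file):
* </p>
* <pre>
* Benchmark benchmark = new Benchmark(new FileReader("conf/sample.alg"));
* benchmark.execute();   // runs the parsed algorithm sequence
* </pre>
*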
* <p>
* You may find existing tasks sufficient for defining the benchmark <i>you</i>
* need; otherwise, you can extend the framework to meet your needs, as explained
* herein.
* </p>
*
* <p>
* Each benchmark run has a DocMaker and a QueryMaker. These two should usually
* match, so that "meaningful" queries are used for a certain collection.
* Properties set at the header of the alg file define which "makers" should be
* used. You can also specify your own makers, extending DocMaker and implementing
* QueryMaker.
* <blockquote>
* <b>Note:</b> since 2.9, DocMaker is a concrete class which accepts a
* ContentSource. In most cases, you can use the DocMaker class to create
* Documents, while providing your own ContentSource implementation. For
* example, the current Benchmark package includes ContentSource
* implementations for TREC, Enwiki and Reuters collections, as well as
* others like LineDocSource which reads a 'line' file produced by
* WriteLineDocTask.
* </blockquote>
*
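* <p>
* A minimal sketch of such a custom ContentSource (the class is hypothetical;
* it endlessly re-issues one fixed document, where a real implementation
* would read from a file or archive):
* </p>
* <pre>
* public class OneDocContentSource extends ContentSource {
*   public DocData getNextDocData(DocData docData) {
*     docData.clear();
*     docData.setName("doc");
*     docData.setTitle("a title");
*     docData.setBody("some body text");
*     return docData;   // never throws NoMoreDataException: runs "forever"
*   }
*   public void close() { }
* }
* </pre>
*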
* <p>
* The benchmark .alg file contains the benchmark "algorithm". The syntax is described
* below. Within the algorithm, you can specify groups of commands, assign them
* names, specify commands that should be repeated,
* do commands in serial or in parallel,
* and also control the speed of "firing" the commands.
* </p>
*
* <p>
* This makes it possible, for instance, to specify
* that an index should be opened for update,
* documents should be added to it one by one but not faster than 20 docs a minute,
* and, in parallel with this,
* some N queries should be searched against that index,
* again, no more than 2 queries a second.
* You can have the searches all share an index reader,
* or have them each open its own reader and close it afterwards.
* </p>
*
* <p>
* If the commands available for use in the algorithm do not meet your needs,
* you can add commands by adding a new task under
* org.apache.lucene.benchmark.byTask.tasks -
* you should extend the PerfTask abstract class.
* Make sure that your new task class name is suffixed by Task.
* Assume you added the class "WonderfulTask" - doing so also enables the
* command "Wonderful" to be used in the algorithm.
* </p>
*
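* <p>
* A minimal sketch of such a task (the class body is hypothetical;
* doLogic() does the measured work and returns the number of records
* to count for the run):
* </p>
* <pre>
* public class WonderfulTask extends PerfTask {
*   public WonderfulTask(PerfRunData runData) {
*     super(runData);
*   }
*   public int doLogic() throws Exception {
*     // the work measured and counted for this task
*     return 1;
*   }
* }
* </pre>
*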
* <p>
* <u>External classes</u>: It is sometimes useful to invoke the benchmark
* package with your external alg file that configures the use of your own
* doc/query maker and/or HTML parser. You can work this out without
* modifying the benchmark package code, by passing your class path
* with the benchmark.ext.classpath property:
* <ul>
* <li>ant run-task -Dtask.alg=[full-path-to-your-alg-file]
* <span style="color: #FF0000">-Dbenchmark.ext.classpath=/mydir/classes
* </span> -Dtask.mem=512M</li>
* </ul>
* <p>
* <u>External tasks</u>: When writing your own tasks under a package other than
* <b>org.apache.lucene.benchmark.byTask.tasks</b> specify that package through the
* <span style="color: #FF0000">alt.tasks.packages</span> property.
*
* <a name="algorithm"></a>
* <h2>Benchmark "algorithm"</h2>
*
* <p>
* The following is an informal description of the supported syntax.
* </p>
*
* <ol>
* <li>
* <b>Measuring</b>: When a command is executed, statistics for the elapsed
* execution time and memory consumption are collected.
* At any time, those statistics can be printed, using one of the
* available ReportTasks.
* </li>
* <li>
* <b>Comments</b> start with '<span style="color: #FF0066">#</span>'.
* </li>
* <li>
* <b>Serial</b> sequences are enclosed within '<span style="color: #FF0066">{ }</span>'.
* </li>
* <li>
* <b>Parallel</b> sequences are enclosed within
* '<span style="color: #FF0066">[ ]</span>'
* </li>
* <li>
* <b>Sequence naming:</b> To name a sequence, put
* '<span style="color: #FF0066">"name"</span>' just after
* '<span style="color: #FF0066">{</span>' or '<span style="color: #FF0066">[</span>'.
* <br>Example - <span style="color: #FF0066">{ "ManyAdds" AddDoc } : 1000000</span> -
* would
* name the sequence of 1M add docs "ManyAdds", and this name would later appear
* in statistic reports.
* If you don't specify a name for a sequence, it is given one: you can see it as
* the algorithm is printed just before benchmark execution starts.
* </li>
* <li>
* <b>Repeating</b>:
* To repeat sequence tasks N times, add '<span style="color: #FF0066">: N</span>' just
* after the
* sequence closing tag - '<span style="color: #FF0066">}</span>' or
* '<span style="color: #FF0066">]</span>' or '<span style="color: #FF0066">&gt;</span>'.
* <br>Example - <span style="color: #FF0066">[ AddDoc ] : 4</span> - would do 4 addDoc
* in parallel, spawning 4 threads at once.
* <br>Example - <span style="color: #FF0066">[ AddDoc AddDoc ] : 4</span> - would do
* 8 addDoc in parallel, spawning 8 threads at once.
* <br>Example - <span style="color: #FF0066">{ AddDoc } : 30</span> - would do addDoc
* 30 times in a row.
* <br>Example - <span style="color: #FF0066">{ AddDoc AddDoc } : 30</span> - would do
* addDoc 60 times in a row.
* <br><b>Exhaustive repeating</b>: use <span style="color: #FF0066">*</span> instead of
* a number to repeat exhaustively.
* This is sometimes useful, for adding as many files as a doc maker can create,
* without iterating over the same file again, especially when the exact
* number of documents is not known in advance. For instance, TREC files extracted
* from a zip file. Note: when using this, you must also set
* <span style="color: #FF0066">content.source.forever</span> to false.
* <br>Example - <span style="color: #FF0066">{ AddDoc } : *</span> - would add docs
* until the doc maker is "exhausted".
* </li>
* <li>
* <b>Command parameter</b>: a command can optionally take a single parameter.
* If a command does not support a parameter, or if the parameter is of
* the wrong type,
* reading the algorithm will fail with an exception and the test will not start.
* Currently the following tasks take optional parameters:
* <ul>
* <li><b>AddDoc</b> takes a numeric parameter, indicating the required size of
* added document. Note: if the DocMaker implementation used in the test
* does not support makeDoc(size), an exception would be thrown and the test
* would fail.
* </li>
* <li><b>DeleteDoc</b> takes a numeric parameter, indicating the docid to be
* deleted. The latter is not very useful for loops, since the docid is
* fixed, so for deletion in loops it is better to use the
* <code>doc.delete.step</code> property.
* </li>
* <li><b>SetProp</b> takes a mandatory <code>name,value</code> param,
* with ',' used as a separator.
* </li>
* <li><b>SearchTravRetTask</b> and <b>SearchTravTask</b> take a numeric
* parameter, indicating the required traversal size.
* </li>
* <li><b>SearchTravRetLoadFieldSelectorTask</b> takes a string
* parameter: a comma separated list of Fields to load.
* </li>
* <li><b>SearchTravRetHighlighterTask</b> takes a string
* parameter: a comma separated list of parameters to define highlighting. See that
* task's javadocs for more information.
* </li>
* </ul>
* <br>Example - <span style="color: #FF0066">AddDoc(2000)</span> - would add a document
* of size 2000 (~bytes).
* <br>See conf/task-sample.alg for how this can be used, for instance, to check
* which is faster, adding
* many smaller documents, or fewer larger documents.
* Next candidates for supporting a parameter may be the Search tasks,
* for controlling the query size.
* </li>
* <li>
* <b>Statistic recording elimination</b>: - a sequence can also end with
* '<span style="color: #FF0066">&gt;</span>',
* in which case child tasks would not store their statistics.
* This can be useful to avoid exploding stats data, for adding say 1M docs.
* <br>Example - <span style="color: #FF0066">{ "ManyAdds" AddDoc &gt; : 1000000</span> -
* would add a million docs, measure that total, but not save stats for each addDoc.
* <br>Notice that the granularity of System.currentTimeMillis() (which is used
* here) is system dependent,
* and in some systems an operation that takes 5 ms to complete may show 0 ms
* latency time in performance measurements.
* Therefore it is sometimes more accurate to look at the elapsed time of a larger
* sequence, as demonstrated here.
* </li>
* <li>
* <b>Rate</b>:
* To set a rate (ops/sec or ops/min) for a sequence, add
* '<span style="color: #FF0066">: N : R</span>' just after sequence closing tag.
* This would specify repetition of N with rate of R operations/sec.
* Use '<span style="color: #FF0066">R/sec</span>' or
* '<span style="color: #FF0066">R/min</span>'
* to explicitly specify that the rate is per second or per minute.
* The default is per second.
* <br>Example - <span style="color: #FF0066">[ AddDoc ] : 400 : 3</span> - would do 400
* addDoc in parallel, starting up to 3 threads per second.
* <br>Example - <span style="color: #FF0066">{ AddDoc } : 100 : 200/min</span> - would
* do 100 addDoc serially,
* waiting before starting the next add, if the rate would otherwise exceed 200 adds/min.
* </li>
* <li>
* <b>Disable Counting</b>: Each task executed contributes to the records count.
* This count is reflected in reports under recs/s and under recsPerRun.
* Most tasks count 1, some count 0, and some count more.
* (See <a href="#recsCounting">Results record counting clarified</a> for more details.)
* It is possible to disable counting for a task by preceding it with <span style="color: #FF0066">-</span>.
* <br>Example - <span style="color: #FF0066"> -CreateIndex </span> - would count 0 while
* the default behavior for CreateIndex is to count 1.
* </li>
* <li>
* <b>Command names</b>: Each class "AnyNameTask" in the
* package org.apache.lucene.benchmark.byTask.tasks,
* that extends PerfTask, is supported as command "AnyName" that can be
* used in the benchmark "algorithm" description.
* This makes it possible to add new commands simply by adding such classes.
* </li>
* </ol>
*
*
* <a name="tasks"></a>
* <h2>Supported tasks/commands</h2>
*
* <p>
* Existing tasks can be divided into a few groups:
* regular index/search work tasks, report tasks, and control tasks.
* </p>
*
* <ol>
*
* <li>
* <b>Report tasks</b>: There are a few Report commands for generating reports.
* Only task runs that were completed are reported.
* (The 'Report tasks' themselves are not measured and not reported.)
* <ul>
* <li>
* <span style="color: #FF0066">RepAll</span> - all (completed) task runs.
* </li>
* <li>
* <span style="color: #FF0066">RepSumByName</span> - all statistics,
* aggregated by name. So, if AddDoc was executed 2000 times,
* only 1 report line would be created for it, aggregating all those
* 2000 statistic records.
* </li>
* <li>
* <span style="color: #FF0066">RepSelectByPref &nbsp; prefixWord</span> - all
* records for tasks whose names start with
* <span style="color: #FF0066">prefixWord</span>.
* </li>
* <li>
* <span style="color: #FF0066">RepSumByPref &nbsp; prefixWord</span> - all
* records for tasks whose names start with
* <span style="color: #FF0066">prefixWord</span>,
* aggregated by their full task name.
* </li>
* <li>
* <span style="color: #FF0066">RepSumByNameRound</span> - all statistics,
* aggregated by name and by <span style="color: #FF0066">Round</span>.
* So, if AddDoc was executed 2000 times in each of 3
* <span style="color: #FF0066">rounds</span>, 3 report lines would be
* created for it,
* aggregating all those 2000 statistic records in each round.
* See more about rounds in the <span style="color: #FF0066">NewRound</span>
* command description below.
* </li>
* <li>
* <span style="color: #FF0066">RepSumByPrefRound &nbsp; prefixWord</span> -
* similar to <span style="color: #FF0066">RepSumByNameRound</span>,
* except that only tasks whose names start with
* <span style="color: #FF0066">prefixWord</span> are included.
* </li>
* </ul>
* If needed, additional reports can be added by extending the abstract class
* ReportTask, and by
* manipulating the statistics data in Points and TaskStats.
* </li>
*
* <li><b>Control tasks</b>: A few of the tasks control the overall benchmark
* algorithm:
* <ul>
* <li>
* <span style="color: #FF0066">ClearStats</span> - clears the entire statistics.
* Further reports would only include task runs that would start after this
* call.
* </li>
* <li>
* <span style="color: #FF0066">NewRound</span> - virtually start a new round of
* performance test.
* Although this command can be placed anywhere, it mostly makes sense at
* the end of an outermost sequence.
* <br>This increments a global "round counter". All task runs that
* would start now would
* record the new, updated round counter as their round number.
* This would appear in reports.
* In particular, see <span style="color: #FF0066">RepSumByNameRound</span> above.
* <br>An additional effect of NewRound is that numeric and boolean
* properties defined (at the head
* of the .alg file) as a sequence of values, e.g. <span style="color: #FF0066">
* merge.factor=mrg:10:100:10:100</span> would
* increment (cyclically) to the next value.
* Note: this would also be reflected in the reports, in this case under a
* column that would be named "mrg".
* </li>
* <li>
* <span style="color: #FF0066">ResetInputs</span> - DocMaker and the
* various QueryMakers
* would reset their counters to start.
* The way these Maker interfaces work, each call for makeDocument()
* or makeQuery() creates the next document or query
* that it "knows" to create.
* If that pool is "exhausted", the "maker" starts over again.
* The ResetInputs command
* therefore makes the rounds comparable;
* it is thus useful to invoke ResetInputs together with NewRound.
* </li>
* <li>
* <span style="color: #FF0066">ResetSystemErase</span> - reset all index
* and input data and call gc.
* Does NOT reset statistics. This includes ResetInputs.
* All writers/readers are nullified, deleted, closed.
* Index is erased.
* Directory is erased.
* You would have to call CreateIndex once this was called...
* </li>
* <li>
* <span style="color: #FF0066">ResetSystemSoft</span> - reset all
* index and input data and call gc.
* Does NOT reset statistics. This includes ResetInputs.
* All writers/readers are nullified, closed.
* Index is NOT erased.
* Directory is NOT erased.
* This is useful for testing performance on an existing index,
* for instance if the construction of a large index
* took a very long time and now you would like to test
* its search or update performance.
* </li>
* </ul>
* </li>
*
* <li>
* The other existing tasks are quite straightforward and are
* only briefly described here.
* <ul>
* <li>
* <span style="color: #FF0066">CreateIndex</span> and
* <span style="color: #FF0066">OpenIndex</span> both leave the
* index open for later update operations.
* <span style="color: #FF0066">CloseIndex</span> would close it.
* <li>
* <span style="color: #FF0066">OpenReader</span>, similarly, would
* leave an index reader open for later search operations.
* But this has further semantics.
* If a Read operation is performed, and an open reader exists,
* it would be used.
* Otherwise, the read operation would open its own reader
* and close it when the read operation is done.
* This allows testing various scenarios - sharing a reader,
* searching with "cold" reader, with "warmed" reader, etc.
* The read operations affected by this are:
* <span style="color: #FF0066">Warm</span>,
* <span style="color: #FF0066">Search</span>,
* <span style="color: #FF0066">SearchTrav</span> (search and traverse),
* and <span style="color: #FF0066">SearchTravRet</span> (search
* and traverse and retrieve).
* Notice that each of the 3 search task types maintains
* its own queryMaker instance.
* <li>
* <span style="color: #FF0066">CommitIndex</span> and
* <span style="color: #FF0066">ForceMerge</span> can be used to commit
* changes to the index then merge the index segments. The integer
* parameter specifies how many segments to merge down to (default
* 1).
* <li>
* <span style="color: #FF0066">WriteLineDoc</span> prepares a 'line'
* file where each line holds a document with <i>title</i>,
* <i>date</i> and <i>body</i> elements, separated by [TAB].
* A line file is useful if one wants to measure pure indexing
* performance, without the overhead of parsing the data.<br>
* You can use LineDocSource as a ContentSource over a 'line'
* file.
* <li>
* <span style="color: #FF0066">ConsumeContentSource</span> consumes
* a ContentSource. Useful e.g. for testing a ContentSource's
* performance, without the overhead of preparing a Document
* out of it.
* </ul>
* </li>
* </ol>
*
* <a name="properties"></a>
* <h2>Benchmark properties</h2>
*
* <p>
* Properties are read from the header of the .alg file, and
* define several parameters of the performance test.
* As mentioned above for the <span style="color: #FF0066">NewRound</span> task,
* numeric and boolean properties that are defined as a sequence
* of values, e.g. <span style="color: #FF0066">merge.factor=mrg:10:100:10:100</span>
* would increment (cyclically) to the next value
* when NewRound is called, and would also
* appear as a named column in the reports (column
* name would be "mrg" in this example).
* </p>
*
* <p>
* Some of the currently defined properties are:
* </p>
*
* <ol>
* <li>
* <span style="color: #FF0066">analyzer</span> - full
* class name for the analyzer to use.
* Same analyzer would be used in the entire test.
* </li>
*
* <li>
* <span style="color: #FF0066">directory</span> - valid values are
* This tells which directory to use for the performance test.
* </li>
*
* <li>
* <b>Index work parameters</b>:
* Multi int/boolean values would be iterated with calls to NewRound.
* These would also be added as columns in the reports; the first string in the
* sequence is the column name.
* (Make sure it is no shorter than any value in the sequence.)
* <ul>
* <li><span style="color: #FF0066">max.buffered</span>
* <br>Example: max.buffered=buf:10:10:100:100 -
* this would define using maxBufferedDocs of 10 in iterations 0 and 1,
* and 100 in iterations 2 and 3.
* </li>
* <li>
* <span style="color: #FF0066">merge.factor</span> - which
* merge factor to use.
* </li>
* <li>
* <span style="color: #FF0066">compound</span> - whether the index is
* using the compound format or not. Valid values are "true" and "false".
* </li>
* </ul>
* </ol>
*
* <p>
* Here is a list of currently defined properties:
* </p>
* <ol>
*
* <li><b>Root directory for data and indexes:</b>
* <ul><li>work.dir (default is System property "benchmark.work.dir" or "work".)
* </li></ul>
* </li>
*
* <li><b>Docs and queries creation:</b>
* <ul><li>analyzer
* </li><li>doc.maker
* </li><li>content.source.forever
* </li><li>html.parser
* </li><li>doc.stored
* </li><li>doc.tokenized
* </li><li>doc.term.vector
* </li><li>doc.term.vector.positions
* </li><li>doc.term.vector.offsets
* </li><li>doc.store.body.bytes
* </li><li>docs.dir
* </li><li>query.maker
* </li><li>file.query.maker.file
* </li><li>file.query.maker.default.field
* </li><li>search.num.hits
* </li></ul>
* </li>
*
* <li><b>Logging</b>:
* <ul><li>log.step
* </li><li>log.step.[class name]Task, e.g. log.step.DeleteDoc (or log.step.Wonderful for the WonderfulTask example above).
* </li><li>log.queries
* </li><li>task.max.depth.log
* </li></ul>
* </li>
*
* <li><b>Index writing</b>:
* <ul><li>compound
* </li><li>merge.factor
* </li><li>max.buffered
* </li><li>directory
* </li><li>ram.flush.mb
* </li><li>codec.postingsFormat (e.g. Direct). Note: no codec should be specified through default.codec
* </li></ul>
* </li>
*
* <li><b>Doc deletion</b>:
* <ul><li>doc.delete.step
* </li></ul>
* </li>
*
* <li><b>Spatial</b>: Numerous; see spatial.alg
* </li>
*
* <li><b>Task alternative packages</b>:
* <ul><li>alt.tasks.packages
* - comma separated list of additional packages where tasks classes will be looked for
* when not found in the default package (that of PerfTask). If the same task class
* appears in more than one package, the package indicated first in this list will be used.
* </li></ul>
* </li>
*
* </ol>
*
* <p>
* For sample use of these properties see the *.alg files under conf.
* </p>
*
* <a name="example"></a>
* <h2>Example input algorithm and the result benchmark report</h2>
* <p>
* The following example is in conf/sample.alg:
* <pre>
* <span style="color: #003333"># --------------------------------------------------------
* #
* # Sample: what is the effect of doc size on indexing time?
* #
* # There are two parts in this test:
* # - PopulateShort adds 2N documents of length L
* # - PopulateLong adds N documents of length 2L
* # Which one would be faster?
* # The comparison is done twice.
* #
* # --------------------------------------------------------
* </span>
* <span style="color: #990066"># -------------------------------------------------------------------------------------
* # multi val params are iterated by NewRound's, added to reports, start with column name.
* merge.factor=mrg:10:20
* max.buffered=buf:100:1000
* compound=true
*
* analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
* directory=FSDirectory
*
* doc.stored=true
* doc.tokenized=true
* doc.term.vector=false
* doc.add.log.step=500
*
* docs.dir=reuters-out
*
* doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
*
* query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
*
* # task at this depth or less would print when they start
* task.max.depth.log=2
*
* log.queries=false
* # -------------------------------------------------------------------------------------</span>
* <span style="color: #3300FF">{
*
* { "PopulateShort"
* CreateIndex
* { AddDoc(4000) &gt; : 20000
* Optimize
* CloseIndex
* &gt;
*
* ResetSystemErase
*
* { "PopulateLong"
* CreateIndex
* { AddDoc(8000) &gt; : 10000
* Optimize
* CloseIndex
* &gt;
*
* ResetSystemErase
*
* NewRound
*
* } : 2
*
* RepSumByName
* RepSelectByPref Populate
* </span>
* </pre>
*
* <p>
* The command line for running this sample:
* <br><code>ant run-task -Dtask.alg=conf/sample.alg</code>
* </p>
*
* <p>
* The output report from running this test contains the following:
* <pre>
* Operation round mrg buf runCnt recsPerRun rec/s elapsedSec avgUsedMem avgTotalMem
* PopulateShort 0 10 100 1 20003 119.6 167.26 12,959,120 14,241,792
* PopulateLong - - 0 10 100 - - 1 - - 10003 - - - 74.3 - - 134.57 - 17,085,208 - 20,635,648
* PopulateShort 1 20 1000 1 20003 143.5 139.39 63,982,040 94,756,864
* PopulateLong - - 1 20 1000 - - 1 - - 10003 - - - 77.0 - - 129.92 - 87,309,608 - 100,831,232
* </pre>
*
* <a name="recsCounting"></a>
* <h2>Results record counting clarified</h2>
* <p>
* Two columns in the results table indicate records counts: records-per-run and
* records-per-second. What does it mean?
* </p><p>
* Almost every task gets 1 in this count just for being executed.
* Task sequences aggregate the counts of their child tasks,
* plus their own count of 1.
* So, a task sequence containing 5 other task sequences, each running a single
* other task 10 times, would have a count of 1 + 5 * (1 + 10) = 56.
* </p><p>
* The traverse and retrieve tasks "count" more: a traverse task
* would add 1 for each traversed result (hit), and a retrieve task would
* additionally add 1 for each retrieved doc. So, regular Search would
* count 1, SearchTrav that traverses 10 hits would count 11, and a
* SearchTravRet task that retrieves (and traverses) 10, would count 21.
* </p><p>
* Confusing? This might help: always examine the <code>elapsedSec</code> column,
* and always compare "apples to apples", i.e. it is interesting to check how the
* <code>rec/s</code> changed for the same task (or sequence) between two
* different runs, but it is not very useful to know how the <code>rec/s</code>
* differs between <code>Search</code> and <code>SearchTrav</code> tasks. For
* the latter, <code>elapsedSec</code> would bring more insight.
* </p>
*/
package org.apache.lucene.benchmark.byTask;


@@ -1,733 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<HTML>
<HEAD>
<TITLE>Benchmarking Lucene By Tasks</TITLE>
</HEAD>
<BODY>
Benchmarking Lucene By Tasks.
<DIV>
<p>
This package provides "task based" performance benchmarking of Lucene.
One can use the predefined benchmarks, or create new ones.
</p>
<p>
Contained packages:
</p>
<table border=1 cellpadding=4>
<tr>
<td><b>Package</b></td>
<td><b>Description</b></td>
</tr>
<tr>
<td><a href="stats/package-summary.html">stats</a></td>
<td>Statistics maintained when running benchmark tasks.</td>
</tr>
<tr>
<td><a href="tasks/package-summary.html">tasks</a></td>
<td>Benchmark tasks.</td>
</tr>
<tr>
<td><a href="feeds/package-summary.html">feeds</a></td>
<td>Sources for benchmark inputs: documents and queries.</td>
</tr>
<tr>
<td><a href="utils/package-summary.html">utils</a></td>
<td>Utilities used for the benchmark, and for the reports.</td>
</tr>
<tr>
<td><a href="programmatic/package-summary.html">programmatic</a></td>
<td>Sample performance test written programmatically.</td>
</tr>
</table>
<h2>Table Of Contents</h2>
<p>
<ol>
<li><a href="#concept">Benchmarking By Tasks</a></li>
<li><a href="#usage">How to use</a></li>
<li><a href="#algorithm">Benchmark "algorithm"</a></li>
<li><a href="#tasks">Supported tasks/commands</a></li>
<li><a href="#properties">Benchmark properties</a></li>
<li><a href="#example">Example input algorithm and the result benchmark
report.</a></li>
<li><a href="#recsCounting">Results record counting clarified</a></li>
</ol>
</p>
<a name="concept"></a>
<h2>Benchmarking By Tasks</h2>
<p>
Benchmark Lucene using task primitives.
</p>
<p>
A benchmark is composed of some predefined tasks, allowing for creating an
index, adding documents,
optimizing, searching, generating reports, and more. A benchmark run takes an
"algorithm" file
that contains a description of the sequence of tasks making up the run, and some
properties defining a few
additional characteristics of the benchmark run.
</p>
<a name="usage"></a>
<h2>How to use</h2>
<p>
Easiest way to run a benchmarks is using the predefined ant task:
<ul>
<li>ant run-task
<br>- would run the <code>micro-standard.alg</code> "algorithm".
</li>
<li>ant run-task -Dtask.alg=conf/compound-penalty.alg
<br>- would run the <code>compound-penalty.alg</code> "algorithm".
</li>
<li>ant run-task -Dtask.alg=[full-path-to-your-alg-file]
<br>- would run <code>your perf test</code> "algorithm".
</li>
<li>java org.apache.lucene.benchmark.byTask.programmatic.Sample
<br>- would run a performance test programmatically - without using an alg
file. This is less readable, and less convenient, but possible.
</li>
</ul>
</p>
<p>
You may find existing tasks sufficient for defining the benchmark <i>you</i>
need, otherwise, you can extend the framework to meet your needs, as explained
herein.
</p>
<p>
Each benchmark run has a DocMaker and a QueryMaker. These two should usually
match, so that "meaningful" queries are used for a certain collection.
Properties set at the header of the alg file define which "makers" should be
used. You can also specify your own makers, extending DocMaker and implementing
QueryMaker.
<blockquote>
<b>Note:</b> since 2.9, DocMaker is a concrete class which accepts a
ContentSource. In most cases, you can use the DocMaker class to create
Documents, while providing your own ContentSource implementation. For
example, the current Benchmark package includes ContentSource
implementations for TREC, Enwiki and Reuters collections, as well as
others like LineDocSource which reads a 'line' file produced by
WriteLineDocTask.
</blockquote>
</p>
<p>
Benchmark .alg file contains the benchmark "algorithm". The syntax is described
below. Within the algorithm, you can specify groups of commands, assign them
names, specify commands that should be repeated,
do commands in serial or in parallel,
and also control the speed of "firing" the commands.
</p>
<p>
This allows, for instance, to specify
that an index should be opened for update,
documents should be added to it one by one but not faster than 20 docs a minute,
and, in parallel with this,
some N queries should be searched against that index,
again, no more than 2 queries a second.
You can have the searches all share an index reader,
or have them each open its own reader and close it afterwords.
</p>
<p>
If the commands available for use in the algorithm do not meet your needs,
you can add commands by adding a new task under
org.apache.lucene.benchmark.byTask.tasks -
you should extend the PerfTask abstract class.
Make sure that your new task class name is suffixed by Task.
Assume you added the class "WonderfulTask" - doing so also enables the
command "Wonderful" to be used in the algorithm.
</p>
<p>
<u>External classes</u>: It is sometimes useful to invoke the benchmark
package with your external alg file that configures the use of your own
doc/query maker and or html parser. You can work this out without
modifying the benchmark package code, by passing your class path
with the benchmark.ext.classpath property:
<ul>
<li>ant run-task -Dtask.alg=[full-path-to-your-alg-file]
<font color="#FF0000">-Dbenchmark.ext.classpath=/mydir/classes
</font> -Dtask.mem=512M</li>
</ul>
<u>External tasks</u>: When writing your own tasks under a package other than
<b>org.apache.lucene.benchmark.byTask.tasks</b> specify that package thru the
<font color="#FF0000">alt.tasks.packages</font> property.
</p>
<a name="algorithm"></a>
<h2>Benchmark "algorithm"</h2>
<p>
The following is an informal description of the supported syntax.
</p>
<ol>
<li>
<b>Measuring</b>: When a command is executed, statistics for the elapsed
execution time and memory consumption are collected.
At any time, those statistics can be printed, using one of the
available ReportTasks.
</li>
<li>
<b>Comments</b> start with '<font color="#FF0066">#</font>'.
</li>
<li>
<b>Serial</b> sequences are enclosed within '<font color="#FF0066">{ }</font>'.
</li>
<li>
<b>Parallel</b> sequences are enclosed within
'<font color="#FF0066">[ ]</font>'
</li>
<li>
<b>Sequence naming:</b> To name a sequence, put
'<font color="#FF0066">"name"</font>' just after
'<font color="#FF0066">{</font>' or '<font color="#FF0066">[</font>'.
<br>Example - <font color="#FF0066">{ "ManyAdds" AddDoc } : 1000000</font> -
would
name the sequence of 1M add docs "ManyAdds", and this name would later appear
in statistic reports.
If you don't specify a name for a sequence, it is given one: you can see it as
the algorithm is printed just before benchmark execution starts.
</li>
<li>
<b>Repeating</b>:
To repeat sequence tasks N times, add '<font color="#FF0066">: N</font>' just
after the
sequence closing tag - '<font color="#FF0066">}</font>' or
'<font color="#FF0066">]</font>' or '<font color="#FF0066">></font>'.
<br>Example - <font color="#FF0066">[ AddDoc ] : 4</font> - would do 4 addDoc
in parallel, spawning 4 threads at once.
<br>Example - <font color="#FF0066">[ AddDoc AddDoc ] : 4</font> - would do
8 addDoc in parallel, spawning 8 threads at once.
<br>Example - <font color="#FF0066">{ AddDoc } : 30</font> - would do addDoc
30 times in a row.
<br>Example - <font color="#FF0066">{ AddDoc AddDoc } : 30</font> - would do
addDoc 60 times in a row.
<br><b>Exhaustive repeating</b>: use <font color="#FF0066">*</font> instead of
a number to repeat exhaustively.
This is sometimes useful, for adding as many files as a doc maker can create,
without iterating over the same file again, especially when the exact
number of documents is not known in advance. For instance, TREC files extracted
from a zip file. Note: when using this, you must also set
<font color="#FF0066">content.source.forever</font> to false.
<br>Example - <font color="#FF0066">{ AddDoc } : *</font> - would add docs
until the doc maker is "exhausted".
</li>
<li>
<b>Command parameter</b>: a command can optionally take a single parameter.
If a command does not support a parameter, or if the parameter is of
the wrong type,
reading the algorithm fails with an exception and the test does not start.
Currently the following tasks take optional parameters:
<ul>
<li><b>AddDoc</b> takes a numeric parameter, indicating the required size of
the added document. Note: if the DocMaker implementation used in the test
does not support makeDoc(size), an exception is thrown and the test
fails.
</li>
<li><b>DeleteDoc</b> takes a numeric parameter, indicating the docid to be
deleted. A fixed docid is not very useful in loops, so for
deletion in loops it is better to use the
<code>doc.delete.step</code> property.
</li>
<li><b>SetProp</b> takes a mandatory <code>name,value</code> parameter,
with ',' used as the separator - e.g. <font color="#FF0066">SetProp(doc.stored,true)</font>.
</li>
<li><b>SearchTravRetTask</b> and <b>SearchTravTask</b> take a numeric
parameter, indicating the required traversal size.
</li>
<li><b>SearchTravRetLoadFieldSelectorTask</b> takes a string
parameter: a comma separated list of Fields to load.
</li>
<li><b>SearchTravRetHighlighterTask</b> takes a string
parameter: a comma separated list of parameters defining the highlighting. See that
task's javadocs for more information.
</li>
</ul>
<br>Example - <font color="#FF0066">AddDoc(2000)</font> - would add a document
of size 2000 (~bytes).
<br>See conf/task-sample.alg for how this can be used, for instance, to check
which is faster: adding
many smaller documents, or fewer larger documents.
Next candidates for supporting a parameter may be the Search tasks,
for controlling the query size.
</li>
<li>
<b>Statistic recording elimination</b>: a sequence can also end with
'<font color="#FF0066">></font>',
in which case child tasks do not store their statistics.
This can be useful to avoid exploding stats data, e.g. when adding 1M docs.
<br>Example - <font color="#FF0066">{ "ManyAdds" AddDoc > : 1000000</font> -
would add a million docs and measure the total, but not save stats for each addDoc.
<br>Notice that the granularity of System.currentTimeMillis() (which is used
here) is system dependent,
and on some systems an operation that takes 5 ms to complete may show 0 ms
latency time in performance measurements.
Therefore it is sometimes more accurate to look at the elapsed time of a larger
sequence, as demonstrated here.
</li>
<li>
<b>Rate</b>:
To set a rate (ops/sec or ops/min) for a sequence, add
'<font color="#FF0066">: N : R</font>' just after sequence closing tag.
This would specify repetition of N with rate of R operations/sec.
Use '<font color="#FF0066">R/sec</font>' or
'<font color="#FF0066">R/min</font>'
to explicitly specify that the rate is per second or per minute.
The default is per second,
<br>Example - <font color="#FF0066">[ AddDoc ] : 400 : 3</font> - would do 400
addDoc in parallel, starting up to 3 threads per second.
<br>Example - <font color="#FF0066">{ AddDoc } : 100 : 200/min</font> - would
do 100 addDoc serially,
waiting before starting next add, if otherwise rate would exceed 200 adds/min.
</li>
<li>
<b>Disable Counting</b>: Each task executed contributes to the records count.
This count is reflected in reports under recs/s and under recsPerRun.
Most tasks count 1, some count 0, and some count more.
(See <a href="#recsCounting">Results record counting clarified</a> for more details.)
It is possible to disable counting for a task by preceding it with <font color="#FF0066">-</font>.
<br>Example - <font color="#FF0066"> -CreateIndex </font> - would count 0 while
the default behavior for CreateIndex is to count 1.
</li>
<li>
<b>Command names</b>: Each class "AnyNameTask" in the
package org.apache.lucene.benchmark.byTask.tasks
that extends PerfTask is supported as a command "AnyName" that can be
used in the benchmark "algorithm" description.
This makes it possible to add new commands just by adding such classes.
</li>
</ol>
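<p>
As a quick illustration, here is a small contrived fragment (not taken from
any shipped .alg file) combining several of the constructs above: sequence
naming, a command parameter, disabled stats recording ('>'), disabled
counting ('-'), and repetition:
</p>
<pre>
{ "OneRound"
    -CreateIndex
    { "ManyAdds" AddDoc(2000) > : 50000
    CloseIndex
} : 2
</pre>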
<a name="tasks"></a>
<h2>Supported tasks/commands</h2>
<p>
Existing tasks can be divided into a few groups:
regular index/search work tasks, report tasks, and control tasks.
</p>
<ol>
<li>
<b>Report tasks</b>: There are a few Report commands for generating reports.
Only task runs that were completed are reported.
(The 'Report tasks' themselves are not measured and not reported.)
<ul>
<li>
<font color="#FF0066">RepAll</font> - all (completed) task runs.
</li>
<li>
<font color="#FF0066">RepSumByName</font> - all statistics,
aggregated by name. So, if AddDoc was executed 2000 times,
only 1 report line would be created for it, aggregating all those
2000 statistic records.
</li>
<li>
<font color="#FF0066">RepSelectByPref &nbsp; prefixWord</font> - all
records for tasks whose names start with
<font color="#FF0066">prefixWord</font>.
</li>
<li>
<font color="#FF0066">RepSumByPref &nbsp; prefixWord</font> - all
records for tasks whose names start with
<font color="#FF0066">prefixWord</font>,
aggregated by their full task name.
</li>
<li>
<font color="#FF0066">RepSumByNameRound</font> - all statistics,
aggregated by name and by <font color="#FF0066">Round</font>.
So, if AddDoc was executed 2000 times in each of 3
<font color="#FF0066">rounds</font>, 3 report lines would be
created for it,
aggregating all those 2000 statistic records in each round.
See more about rounds in the <font color="#FF0066">NewRound</font>
command description below.
</li>
<li>
<font color="#FF0066">RepSumByPrefRound &nbsp; prefixWord</font> -
similar to <font color="#FF0066">RepSumByNameRound</font>,
just that only tasks whose name starts with
<font color="#FF0066">prefixWord</font> are included.
</li>
</ul>
If needed, additional reports can be added by extending the abstract class
ReportTask, and by
manipulating the statistics data in Points and TaskStats.
</li>
<li><b>Control tasks</b>: A few tasks control the benchmark algorithm
as a whole (a combined usage sketch follows this list):
<ul>
<li>
<font color="#FF0066">ClearStats</font> - clears the entire statistics.
Further reports would only include task runs that would start after this
call.
</li>
<li>
<font color="#FF0066">NewRound</font> - virtually start a new round of
performance test.
Although this command can be placed anywhere, it mostly makes sense at
the end of an outermost sequence.
<br>This increments a global "round counter". All task runs that
would start now would
record the new, updated round counter as their round number.
This would appear in reports.
In particular, see <font color="#FF0066">RepSumByNameRound</font> above.
<br>An additional effect of NewRound, is that numeric and boolean
properties defined (at the head
of the .alg file) as a sequence of values, e.g. <font color="#FF0066">
merge.factor=mrg:10:100:10:100</font> would
increment (cyclic) to the next value.
Note: this would also be reflected in the reports, in this case under a
column that would be named "mrg".
</li>
<li>
<font color="#FF0066">ResetInputs</font> - DocMaker and the
various QueryMakers
would reset their counters to start.
The way these Maker interfaces work, each call for makeDocument()
or makeQuery() creates the next document or query
that it "knows" to create.
If that pool is "exhausted", the "maker" start over again.
The ResetInputs command
therefore allows to make the rounds comparable.
It is therefore useful to invoke ResetInputs together with NewRound.
</li>
<li>
<font color="#FF0066">ResetSystemErase</font> - reset all index
and input data and call gc.
Does NOT reset statistics. This contains ResetInputs.
All writers/readers are nullified, deleted, closed.
Index is erased.
Directory is erased.
You would have to call CreateIndex once this was called...
</li>
<li>
<font color="#FF0066">ResetSystemSoft</font> - reset all
index and input data and call gc.
Does NOT reset statistics. This contains ResetInputs.
All writers/readers are nullified, closed.
Index is NOT erased.
Directory is NOT erased.
This is useful for testing performance on an existing index,
for instance if the construction of a large index
took a very long time and now you would to test
its search or update performance.
</li>
</ul>
</li>
<li>
Other existing tasks are quite straightforward and are
only briefly described here.
<ul>
<li>
<font color="#FF0066">CreateIndex</font> and
<font color="#FF0066">OpenIndex</font> both leave the
index open for later update operations.
<font color="#FF0066">CloseIndex</font> would close it.
<li>
<font color="#FF0066">OpenReader</font>, similarly, would
leave an index reader open for later search operations.
But this have further semantics.
If a Read operation is performed, and an open reader exists,
it would be used.
Otherwise, the read operation would open its own reader
and close it when the read operation is done.
This allows testing various scenarios - sharing a reader,
searching with "cold" reader, with "warmed" reader, etc.
The read operations affected by this are:
<font color="#FF0066">Warm</font>,
<font color="#FF0066">Search</font>,
<font color="#FF0066">SearchTrav</font> (search and traverse),
and <font color="#FF0066">SearchTravRet</font> (search
and traverse and retrieve).
Notice that each of the 3 search task types maintains
its own queryMaker instance.
<li>
<font color="#FF0066">CommitIndex</font> and
<font color="#FF0066">ForceMerge</font> can be used to commit
changes to the index then merge the index segments. The integer
parameter specifies how many segments to merge down to (default
1).
<li>
<font color="#FF0066">WriteLineDoc</font> prepares a 'line'
file where each line holds a document with <i>title</i>,
<i>date</i> and <i>body</i> elements, separated by [TAB].
A line file is useful if one wants to measure pure indexing
performance, without the overhead of parsing the data.<br>
You can use LineDocSource as a ContentSource over a 'line'
file.
<li>
<font color="#FF0066">ConsumeContentSource</font> consumes
a ContentSource. Useful for e.g. testing a ContentSource
performance, without the overhead of preparing a Document
out of it.
</ul>
</li>
</ol>
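<p>
Putting the control and read tasks together, a multi-round benchmark skeleton
might look like the following sketch (illustrative only, not taken from a
shipped .alg file):
</p>
<pre>
{ "Round"
    ResetSystemErase
    CreateIndex
    { AddDoc } : 10000
    ForceMerge(1)
    CloseIndex
    OpenReader
    { "SrchSameRdr" Search } : 500
    CloseReader
    ResetInputs
    NewRound
} : 4
RepSumByNameRound
</pre>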
<a name="properties"></a>
<h2>Benchmark properties</h2>
<p>
Properties are read from the header of the .alg file, and
define several parameters of the performance test.
As mentioned above for the <font color="#FF0066">NewRound</font> task,
numeric and boolean properties that are defined as a sequence
of values, e.g. <font color="#FF0066">merge.factor=mrg:10:100:10:100</font>,
advance (cyclically) to the next value
when NewRound is called, and also
appear as a named column in the reports (the column
name would be "mrg" in this example).
</p>
<p>
Some of the currently defined properties are:
</p>
<ol>
<li>
<font color="#FF0066">analyzer</font> - full
class name for the analyzer to use.
Same analyzer would be used in the entire test.
</li>
<li>
<font color="#FF0066">directory</font> - valid values are
This tells which directory to use for the performance test.
</li>
<li>
<b>Index work parameters</b>:
Multi-valued int/boolean properties are iterated with calls to NewRound.
They are also added as columns in the reports; the first string in the
sequence is the column name.
(Make sure it is no shorter than any value in the sequence.)
<ul>
<li><font color="#FF0066">max.buffered</font>
<br>Example: max.buffered=buf:10:10:100:100 -
this would define using maxBufferedDocs of 10 in iterations 0 and 1,
and 100 in iterations 2 and 3.
</li>
<li>
<font color="#FF0066">merge.factor</font> - which
merge factor to use.
</li>
<li>
<font color="#FF0066">compound</font> - whether the index is
using the compound format or not. Valid values are "true" and "false".
</li>
</ul>
</ol>
<p>
Here is a list of currently defined properties:
</p>
<ol>
<li><b>Root directory for data and indexes:</b>
<ul><li>work.dir (default is the system property "benchmark.work.dir", or "work").
</li></ul>
</li>
<li><b>Docs and queries creation:</b>
<ul><li>analyzer
</li><li>doc.maker
</li><li>content.source.forever
</li><li>html.parser
</li><li>doc.stored
</li><li>doc.tokenized
</li><li>doc.term.vector
</li><li>doc.term.vector.positions
</li><li>doc.term.vector.offsets
</li><li>doc.store.body.bytes
</li><li>docs.dir
</li><li>query.maker
</li><li>file.query.maker.file
</li><li>file.query.maker.default.field
</li><li>search.num.hits
</li></ul>
</li>
<li><b>Logging</b>:
<ul><li>log.step
</li><li>log.step.[class name]Task, e.g. log.step.DeleteDoc (or log.step.Wonderful for the WonderfulTask example above).
</li><li>log.queries
</li><li>task.max.depth.log
</li></ul>
</li>
<li><b>Index writing</b>:
<ul><li>compound
</li><li>merge.factor
</li><li>max.buffered
</li><li>directory
</li><li>ram.flush.mb
</li><li>codec.postingsFormat (e.g. Direct). Note: no codec should be specified through default.codec.
</li></ul>
</li>
<li><b>Doc deletion</b>:
<ul><li>doc.delete.step
</li></ul>
</li>
<li><b>Spatial</b>: Numerous; see spatial.alg
</li>
<li><b>Task alternative packages</b>:
<ul><li>alt.tasks.packages
- comma separated list of additional packages where task classes are looked for
when not found in the default package (that of PerfTask). If the same task class
appears in more than one package, the package indicated first in this list is used.
</li></ul>
</li>
</ol>
<p>
For sample use of these properties see the *.alg files under conf.
</p>
<a name="example"></a>
<h2>Example input algorithm and the result benchmark report</h2>
<p>
The following example is in conf/sample.alg:
<pre>
<font color="#003333"># --------------------------------------------------------
#
# Sample: what is the effect of doc size on indexing time?
#
# There are two parts in this test:
# - PopulateShort adds 2N documents of length L
# - PopulateLong adds N documents of length 2L
# Which one would be faster?
# The comparison is done twice.
#
# --------------------------------------------------------
</font>
<font color="#990066"># -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
merge.factor=mrg:10:20
max.buffered=buf:100:1000
compound=true
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
doc.stored=true
doc.tokenized=true
doc.term.vector=false
doc.add.log.step=500
docs.dir=reuters-out
doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
# task at this depth or less would print when they start
task.max.depth.log=2
log.queries=false
# -------------------------------------------------------------------------------------</font>
<font color="#3300FF">{
{ "PopulateShort"
CreateIndex
{ AddDoc(4000) > : 20000
Optimize
CloseIndex
>
ResetSystemErase
{ "PopulateLong"
CreateIndex
{ AddDoc(8000) > : 10000
Optimize
CloseIndex
>
ResetSystemErase
NewRound
} : 2
RepSumByName
RepSelectByPref Populate
</font>
</pre>
</p>
<p>
The command line for running this sample:
<br><code>ant run-task -Dtask.alg=conf/sample.alg</code>
</p>
<p>
The output report from running this test contains the following:
<pre>
Operation round mrg buf runCnt recsPerRun rec/s elapsedSec avgUsedMem avgTotalMem
PopulateShort 0 10 100 1 20003 119.6 167.26 12,959,120 14,241,792
PopulateLong - - 0 10 100 - - 1 - - 10003 - - - 74.3 - - 134.57 - 17,085,208 - 20,635,648
PopulateShort 1 20 1000 1 20003 143.5 139.39 63,982,040 94,756,864
PopulateLong - - 1 20 1000 - - 1 - - 10003 - - - 77.0 - - 129.92 - 87,309,608 - 100,831,232
</pre>
</p>
<a name="recsCounting"></a>
<h2>Results record counting clarified</h2>
<p>
Two columns in the results table indicate records counts: records-per-run and
records-per-second. What does it mean?
</p><p>
Almost every task gets 1 in this count just for being executed.
Task sequences aggregate the counts of their child tasks,
plus their own count of 1.
So, a task sequence containing 5 other task sequences, each running a single
other task 10 times, would have a count of 1 + 5 * (1 + 10) = 56.
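For instance, written as an algorithm fragment (contrived, with the counts
shown as trailing comments):
<pre>
{                      # 1 (the outer sequence itself)
    { AddDoc } : 10    # 1 + 10 = 11
    { AddDoc } : 10    # 11
    { AddDoc } : 10    # 11
    { AddDoc } : 10    # 11
    { AddDoc } : 10    # 11
}                      # total: 1 + 5 * 11 = 56
</pre>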
</p><p>
The traverse and retrieve tasks "count" more: a traverse task
would add 1 for each traversed result (hit), and a retrieve task would
additionally add 1 for each retrieved doc. So, regular Search would
count 1, SearchTrav that traverses 10 hits would count 11, and a
SearchTravRet task that retrieves (and traverses) 10, would count 21.
</p><p>
Confusing? This might help: always examine the <code>elapsedSec</code> column,
and always compare "apples to apples", i.e. it is interesting to check how the
<code>rec/s</code> changed for the same task (or sequence) between two
different runs, but it is not very useful to know how the <code>rec/s</code>
differs between <code>Search</code> and <code>SearchTrav</code> tasks. For
the latter, <code>elapsedSec</code> would bring more insight.
</p>
</DIV>
<DIV>&nbsp;</DIV>
</BODY>
</HTML>

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Sample performance test written programmatically - no algorithm file is needed here.
*/
package org.apache.lucene.benchmark.byTask.programmatic;

@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Sample performance test written programmatically - no algorithm file is needed here.
</body>
</html>

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Statistics maintained when running benchmark tasks.
*/
package org.apache.lucene.benchmark.byTask.stats;

@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Statistics maintained when running benchmark tasks.
</body>
</html>

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Extendable benchmark tasks.
*/
package org.apache.lucene.benchmark.byTask.tasks;

@ -1,26 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="Author" content="Doron Cohen">
</head>
<body>
Extendable benchmark tasks.
</body>
</html>

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Utilities used for the benchmark, and for the reports.
*/
package org.apache.lucene.benchmark.byTask.utils;

@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Utilities used for the benchmark, and for the reports.
</body>
</html>

@ -0,0 +1,44 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Lucene Benchmarking Package
* <p>
* The benchmark contribution contains tools for benchmarking Lucene using standard, freely available corpora.
* <p>
* ANT will download the corpus automatically, place it in a temp directory and then unpack it to the
* working.dir directory specified in the build. The temp directory and working directory can be safely
* removed after a run. However, the next time the task is run, it will need to download the files again.
* <p>
* Classes implementing the Benchmarker interface should have a no-argument constructor if they are to be
* used with the Driver class. The Driver class is provided for convenience only. Feel free to implement
* your own main class for your benchmarker.
* <p>
* The StandardBenchmarker is meant to be just that, a standard that runs out of the box with no
* configuration or changes needed. Other benchmarking classes may derive from it to provide alternate
* views or to take in command line options. When reporting benchmarking runs you should state any
* alterations you have made.
* <p>
* To run the short version of the StandardBenchmarker, call "ant run-micro-standard". This should take
* a minute or so to complete and give you a preliminary idea of how your change affects the code.
* <p>
* To run the long version of the StandardBenchmarker, call "ant run-standard". This takes considerably longer.
* <p>
* The original code for these classes was donated by Andrzej Bialecki at http://issues.apache.org/jira/browse/LUCENE-675
* and has been updated by Grant Ingersoll to make some parts of the code reusable in other benchmarkers.
*/
package org.apache.lucene.benchmark;

@ -1,46 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<HTML>
<HEAD>
<TITLE>Lucene Benchmarking Package</TITLE>
</HEAD>
<BODY>
The benchmark contribution contains tools for benchmarking Lucene using standard, freely available corpora.
<DIV>
<p/>
ANT will
download the corpus automatically, place it in a temp directory and then unpack it to the working.dir directory specified in the build.
The temp directory
and working directory can be safely removed after a run. However, the next time the task is run, it will need to download the files again.
<p/>
Classes implementing the Benchmarker interface should have a no-argument constructor if they are to be used with the Driver class. The Driver
class is provided for convenience only. Feel free to implement your own main class for your benchmarker.
<p/>
The StandardBenchmarker is meant to be just that, a standard that runs out of the box with no configuration or changes needed.
Other benchmarking classes may derive from it to provide alternate views or to take in command line options. When reporting benchmarking runs
you should state any alterations you have made.
<p/>
To run the short version of the StandardBenchmarker, call "ant run-micro-standard". This should take a minute or so to complete and give you a preliminary idea of how your change affects the code.
<p/>
To run the long version of the StandardBenchmarker, call "ant run-standard". This takes considerably longer.
<p/>
The original code for these classes was donated by Andrzej Bialecki at http://issues.apache.org/jira/browse/LUCENE-675 and has been updated by Grant Ingersoll to make some parts of the code reusable in other benchmarkers.
</DIV>
<DIV>&nbsp;</DIV>
</BODY>
</HTML>

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Search Quality Benchmarking.
* <p>
* This package allows you to benchmark the search quality of a Lucene application.
* <p>
* In order to use this package you should provide:
* <ul>
* <li>An <a href="{@docRoot}/../core/org/apache/lucene/search/IndexSearcher.html">IndexSearcher</a>.</li>
* <li>{@link org.apache.lucene.benchmark.quality.QualityQuery Quality queries}.</li>
* <li>{@link org.apache.lucene.benchmark.quality.Judge Judging object}.</li>
* <li>{@link org.apache.lucene.benchmark.quality.utils.SubmissionReport Reporting object}.</li>
* </ul>
* <p>
* For benchmarking TREC collections with TREC QRels, take a look at the
* {@link org.apache.lucene.benchmark.quality.trec trec package}.
* <p>
* Here is sample code used to run the TREC 2006 queries 701-850 on the .Gov2 collection:
*
* <pre class="prettyprint">
* File topicsFile = new File("topics-701-850.txt");
* File qrelsFile = new File("qrels-701-850.txt");
* IndexReader ir = DirectoryReader.open(directory);
* IndexSearcher searcher = new IndexSearcher(ir);
*
* int maxResults = 1000;
* String docNameField = "docname";
*
* PrintWriter logger = new PrintWriter(System.out,true);
*
* // use trec utilities to read trec topics into quality queries
* TrecTopicsReader qReader = new TrecTopicsReader();
* QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile)));
*
* // prepare judge, with trec utilities that read from a QRels file
* Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile)));
*
* // validate topics &amp; judgments match each other
* judge.validateData(qqs, logger);
*
* // set the parsing of quality queries into Lucene queries.
* QualityQueryParser qqParser = new SimpleQQParser("title", "body");
*
* // run the benchmark
* QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);
* SubmissionReport submitLog = null;
* QualityStats stats[] = qrun.execute(maxResults, judge, submitLog, logger);
*
* // print an average sum of the results
* QualityStats avg = QualityStats.average(stats);
* avg.log("SUMMARY",2,logger, " ");
* </pre>
*
* <p>
* Some immediate ways to modify this program to your needs are:
* <ul>
* <li>To run on different formats of queries and judgements, provide your own
* {@link org.apache.lucene.benchmark.quality.Judge Judge} and
* {@link org.apache.lucene.benchmark.quality.QualityQuery Quality queries}.</li>
* <li>Create sophisticated Lucene queries by supplying a different
* {@link org.apache.lucene.benchmark.quality.QualityQueryParser Quality query parser}.</li>
* </ul>
*/
package org.apache.lucene.benchmark.quality;

@ -1,83 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
<h2>Search Quality Benchmarking.</h2>
<p>
This package allows you to benchmark the search quality of a Lucene application.
<p>
In order to use this package you should provide:
<ul>
<li>An <a href="{@docRoot}/../core/org/apache/lucene/search/IndexSearcher.html">IndexSearcher</a>.</li>
<li><a href="QualityQuery.html">Quality queries</a>.</li>
<li><a href="Judge.html">Judging object</a>.</li>
<li><a href="utils/SubmissionReport.html">Reporting object</a>.</li>
</ul>
<p>
For benchmarking TREC collections with TREC QRels, take a look at the
<a href="trec/package-summary.html">trec package</a>.
<p>
Here is sample code used to run the TREC 2006 queries 701-850 on the .Gov2 collection:
<pre class="prettyprint">
File topicsFile = new File("topics-701-850.txt");
File qrelsFile = new File("qrels-701-850.txt");
IndexReader ir = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(ir);
int maxResults = 1000;
String docNameField = "docname";
PrintWriter logger = new PrintWriter(System.out,true);
// use trec utilities to read trec topics into quality queries
TrecTopicsReader qReader = new TrecTopicsReader();
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile)));
// prepare judge, with trec utilities that read from a QRels file
Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile)));
// validate topics & judgments match each other
judge.validateData(qqs, logger);
// set the parsing of quality queries into Lucene queries.
QualityQueryParser qqParser = new SimpleQQParser("title", "body");
// run the benchmark
QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);
SubmissionReport submitLog = null;
QualityStats stats[] = qrun.execute(maxResults, judge, submitLog, logger);
// print an average sum of the results
QualityStats avg = QualityStats.average(stats);
avg.log("SUMMARY",2,logger, " ");
</pre>
<p>
Some immediate ways to modify this program to your needs are:
<ul>
<li>To run on different formats of queries and judgements, provide your own
<a href="Judge.html">Judge</a> and
<a href="QualityQuery.html">Quality queries</a>.</li>
<li>Create sophisticated Lucene queries by supplying a different
<a href="QualityQueryParser.html">Quality query parser</a>.</li>
</ul>
</body>
</html>

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Utilities for TREC-related quality benchmarking, feeding from TREC Topics and QRels inputs.
*/
package org.apache.lucene.benchmark.quality.trec;

@ -1,23 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Utilities for TREC-related quality benchmarking, feeding from TREC Topics and QRels inputs.
</body>
</html>

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Miscellaneous utilities for search quality benchmarking: query parsing, submission reports.
*/
package org.apache.lucene.benchmark.quality.utils;

@ -1,23 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Miscellaneous utilities for search quality benchmarking: query parsing, submission reports.
</body>
</html>

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Benchmark Utility functions.
*/
package org.apache.lucene.benchmark.utils;

@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Benchmark Utility functions.
</body>
</html>