2007-04-17 03:11:04 -04:00
|
|
|
<?xml version="1.0"?>
|
2011-05-26 19:32:56 -04:00
|
|
|
|
|
|
|
<!--
|
|
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
|
|
this work for additional information regarding copyright ownership.
|
|
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
(the "License"); you may not use this file except in compliance with
|
|
|
|
the License. You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
-->
|
|
|
|
|
2007-04-17 03:11:04 -04:00
|
|
|
<project name="benchmark" default="default">
|
|
|
|
|
|
|
|
<description>
  Lucene Benchmarking Contributions
</description>
|
|
|
|
|
2011-01-04 07:28:10 -05:00
|
|
|
<!-- Output directories. Declared with location= so Ant resolves them to absolute paths. -->
<property location="build/" name="build.dir"/>
<property location="dist/" name="dist.dir"/>
<property location="../dist/maven" name="maven.dist.dir"/>
|
2011-01-04 07:28:10 -05:00
|
|
|
|
|
|
|
<import file="../../lucene/contrib/contrib-build.xml"/>
|
2008-06-10 07:58:00 -04:00
|
|
|
<!-- Directory that receives expanded corpora and benchmark output files. -->
<property location="work" name="working.dir"/>

<!-- The tests have problems when run in parallel. -->
<!-- NOTE(review): the meaning of 0 is defined by the imported build file; confirm there. -->
<property value="0" name="tests.threadspercpu"/>
|
2010-04-11 08:18:16 -04:00
|
|
|
|
2007-04-17 03:11:04 -04:00
|
|
|
<target name="check-files">
  <!--
    Probes for already-downloaded archives (under temp/) and already
    expanded or extracted data (mostly under ${working.dir}). Each
    <available> sets its property only when the named file or directory
    exists; the get/expand/extract targets in this file test those
    properties in their unless= guards to skip work that is already done.
  -->

  <!-- 20 Newsgroups (CMU copy) -->
  <available file="temp/news20.tar.gz" property="news20.exists"/>
  <available file="${working.dir}/20_newsgroup" property="news20.expanded"/>

  <!-- Reuters-21578 -->
  <available file="temp/reuters21578.tar.gz" property="reuters.exists"/>
  <available file="${working.dir}/reuters" property="reuters.expanded"/>
  <available file="${working.dir}/reuters-out" property="reuters.extracted"/>

  <!-- 20news-18828 -->
  <available file="temp/20news-18828.tar.gz" property="20news-18828.exists"/>
  <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>

  <!-- Mini newsgroups. mini.exists is the guard of get-mini-news; it was
       previously never set, so the archive presence is probed here. -->
  <available file="temp/mini_newsgroups.tar.gz" property="mini.exists"/>
  <available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>

  <!-- English Wikipedia dump -->
  <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
  <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
  <available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>

  <!-- Top-100k Wikipedia word lists -->
  <available file="temp/${top.100k.words.archive.filename}"
             property="top.100k.words.archive.present"/>
  <available file="${working.dir}/top100k-out"
             property="top.100k.word.files.expanded"/>
</target>
|
|
|
|
|
|
|
|
<!-- Ensures temp/ exists, then runs the download and expand steps for the
     Wikipedia dump; each step is skipped by its own unless= guard. -->
<target depends="check-files" name="enwiki-files">
  <mkdir dir="temp"/>
  <antcall target="get-enwiki"/>
  <antcall target="expand-enwiki"/>
</target>
|
|
|
|
|
|
|
|
<!-- Downloads the compressed Wikipedia dump into temp/ unless enwiki.exists is set. -->
<target unless="enwiki.exists" name="get-enwiki">
  <get dest="temp/enwiki-20070527-pages-articles.xml.bz2"
       src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"/>
</target>
|
|
|
|
|
|
|
|
<!-- Decompresses the Wikipedia dump into temp/ unless enwiki.expanded is set. -->
<target unless="enwiki.expanded" name="expand-enwiki">
  <bunzip2 dest="temp" src="temp/enwiki-20070527-pages-articles.xml.bz2"/>
</target>
|
|
|
|
|
2007-04-17 03:11:04 -04:00
|
|
|
<!--
  Downloads the CMU 20 Newsgroups archive to temp/news20.tar.gz.
  Guarded by news20.exists, the property check-files derives from that same
  file. The guard used to be 20news-18828.exists, which belongs to a
  different corpus and so skipped or forced this download incorrectly.
-->
<target name="get-news-20" unless="news20.exists">
  <get src="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
       dest="temp/news20.tar.gz"/>
</target>
|
|
|
|
<!-- Downloads the Reuters-21578 archive into temp/ unless reuters.exists is set. -->
<target unless="reuters.exists" name="get-reuters">
  <get dest="temp/reuters21578.tar.gz"
       src="http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"/>
</target>
|
|
|
|
|
|
|
|
<!-- Gunzips news20.tar.gz inside temp/ and unpacks the resulting tar into
     ${working.dir}, unless news20.expanded is set. -->
<target unless="news20.expanded" name="expand-news-20">
  <gunzip dest="temp" src="temp/news20.tar.gz"/>
  <untar dest="${working.dir}" src="temp/news20.tar"/>
</target>
|
|
|
|
<!-- Unpacks the Reuters archive into ${working.dir}/reuters and then removes
     the top-level *.txt files from it. Skipped when reuters.expanded is set. -->
<target unless="reuters.expanded" name="expand-reuters">
  <gunzip dest="temp" src="temp/reuters21578.tar.gz"/>
  <mkdir dir="${working.dir}/reuters"/>
  <untar dest="${working.dir}/reuters" src="temp/reuters21578.tar"/>
  <delete>
    <fileset dir="${working.dir}/reuters" includes="*.txt"/>
  </delete>
</target>
|
|
|
|
<!-- Runs ExtractReuters in a forked JVM with the expanded Reuters directory
     and the reuters-out directory as its two arguments. Skipped when
     reuters.extracted is set. -->
<target unless="reuters.extracted" depends="check-files" name="extract-reuters">
  <java fork="true"
        maxmemory="1024M"
        classpathref="run.classpath"
        classname="org.apache.lucene.benchmark.utils.ExtractReuters">
    <arg file="${working.dir}/reuters"/>
    <arg file="${working.dir}/reuters-out"/>
  </java>
</target>
|
|
|
|
<!-- Downloads the 20news-18828 archive into temp/ unless 20news-18828.exists is set. -->
<target unless="20news-18828.exists" name="get-20news-18828">
  <get dest="temp/20news-18828.tar.gz"
       src="http://people.csail.mit.edu/u/j/jrennie/public_html/20Newsgroups/20news-18828.tar.gz"/>
</target>
|
|
|
|
<!-- Gunzips 20news-18828.tar.gz inside temp/ and unpacks the tar into
     ${working.dir}, unless 20news-18828.expanded is set. -->
<target unless="20news-18828.expanded" name="expand-20news-18828">
  <gunzip dest="temp" src="temp/20news-18828.tar.gz"/>
  <untar dest="${working.dir}" src="temp/20news-18828.tar"/>
</target>
|
|
|
|
<!-- Downloads the mini newsgroups archive into temp/ unless mini.exists is set. -->
<target unless="mini.exists" name="get-mini-news">
  <get dest="temp/mini_newsgroups.tar.gz"
       src="http://kdd.ics.uci.edu/databases/20newsgroups/mini_newsgroups.tar.gz"/>
</target>
|
|
|
|
<!-- Gunzips mini_newsgroups.tar.gz inside temp/ and unpacks the tar into
     ${working.dir}, unless mini.expanded is set. -->
<target unless="mini.expanded" name="expand-mini-news">
  <gunzip dest="temp" src="temp/mini_newsgroups.tar.gz"/>
  <untar dest="${working.dir}" src="temp/mini_newsgroups.tar"/>
</target>
|
|
|
|
|
2010-01-12 15:06:17 -05:00
|
|
|
<!-- File name and download base URL of the top-100k Wikipedia words archive. -->
<property value="top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"
          name="top.100k.words.archive.filename"/>
<property value="http://people.apache.org/~rmuir/wikipedia"
          name="top.100k.words.archive.base.url"/>
|
|
|
|
<!-- Creates temp/ and downloads the top-100k words archive into it, unless
     top.100k.words.archive.present is set. -->
<target unless="top.100k.words.archive.present" name="get-top-100k-words-archive">
  <mkdir dir="temp"/>
  <get dest="temp/${top.100k.words.archive.filename}"
       src="${top.100k.words.archive.base.url}/${top.100k.words.archive.filename}"/>
</target>
|
|
|
|
<!-- Unpacks the bzip2-compressed words archive into ${working.dir}/top100k-out,
     overwriting existing files. Skipped when top.100k.word.files.expanded is set. -->
<target unless="top.100k.word.files.expanded" name="expand-top-100k-word-files">
  <mkdir dir="${working.dir}/top100k-out"/>
  <untar dest="${working.dir}/top100k-out"
         compression="bzip2"
         overwrite="true"
         src="temp/${top.100k.words.archive.filename}"/>
</target>
|
|
|
|
|
|
|
|
<!-- Ensures ${working.dir} exists, then runs the download and expand steps
     for the top-100k word lists; each step has its own unless= guard. -->
<target depends="check-files" name="top-100k-wiki-word-files">
  <mkdir dir="${working.dir}"/>
  <antcall target="get-top-100k-words-archive"/>
  <antcall target="expand-top-100k-word-files"/>
</target>
|
|
|
|
|
2007-04-17 03:11:04 -04:00
|
|
|
<!-- Prepares the Reuters corpus: creates temp/, then calls the download,
     expand and extract targets in that order. -->
<target depends="check-files" name="get-files">
  <mkdir dir="temp"/>
  <antcall target="get-reuters"/>
  <antcall target="expand-reuters"/>
  <antcall target="extract-reuters"/>
</target>
|
|
|
|
|
|
|
|
<!-- Compile classpath: sibling module jars, the ICU4J jar, the inherited
     base classpath and two bundled jars from lib/. Entry order is preserved. -->
<path id="classpath">
  <pathelement path="${memory.jar}"/>
  <pathelement path="${highlighter.jar}"/>
  <pathelement path="${analyzers-common.jar}"/>
  <pathelement path="${queryparser.jar}"/>
  <pathelement path="${facet.jar}"/>
  <fileset dir="${common.dir}/../modules/analysis/icu/lib">
    <include name="icu4j-4.8.1.1.jar"/>
  </fileset>
  <path refid="base.classpath"/>
  <fileset dir="lib" includes="commons-compress-1.2.jar,xercesImpl-2.9.1.jar"/>
</path>
|
|
|
|
<!-- Runtime classpath: the compile classpath, this module's compiled classes,
     and any extra entries supplied through benchmark.ext.classpath. -->
<path id="run.classpath">
  <path refid="classpath"/>
  <pathelement location="${build.dir}/classes/java"/>
  <pathelement path="${benchmark.ext.classpath}"/>
</path>
|
|
|
|
|
2008-06-10 07:58:00 -04:00
|
|
|
<!-- Defaults for run-task: the algorithm file to run and the forked JVM's
     maximum heap. Both can be overridden on the command line with -D. -->
<property location="conf/micro-standard.alg" name="task.alg"/>
<property value="140M" name="task.mem"/>
|
|
|
|
|
2010-01-14 16:23:35 -05:00
|
|
|
<!-- Runs the byTask Benchmark driver in a forked JVM on the algorithm file
     named by task.alg, with task.mem as the maximum heap. -->
<target name="run-task"
        description="Run compound penalty perf test (optional: -Dtask.alg=your-algorithm-file -Dtask.mem=java-max-mem)"
        depends="compile,check-files,get-files">
  <echo>Working Directory: ${working.dir}</echo>
  <java fork="true"
        maxmemory="${task.mem}"
        classpathref="run.classpath"
        classname="org.apache.lucene.benchmark.byTask.Benchmark">
    <arg file="${task.alg}"/>
  </java>
</target>
|
|
|
|
|
2007-06-30 22:19:10 -04:00
|
|
|
<!-- Runs the byTask Benchmark driver on conf/extractWikipedia.alg in a forked
     JVM with assertions enabled and a 1024M maximum heap. -->
<target depends="compile,check-files,enwiki-files" name="enwiki">
  <echo>Working Directory: ${working.dir}</echo>
  <java fork="true"
        maxmemory="1024M"
        classname="org.apache.lucene.benchmark.byTask.Benchmark">
    <assertions>
      <enable/>
    </assertions>
    <classpath refid="run.classpath"/>
    <arg file="conf/extractWikipedia.alg"/>
  </java>
</target>
|
|
|
|
|
2010-01-12 15:06:17 -05:00
|
|
|
<!-- Collation benchmark: algorithm file, raw output file and JIRA-table output file. -->
<property location="conf/collation.alg" name="collation.alg.file"/>
<property value="${working.dir}/collation.benchmark.output.txt"
          name="collation.output.file"/>
<property value="${working.dir}/collation.bm2jira.output.txt"
          name="collation.jira.output.file"/>
|
|
|
|
|
|
|
|
<!-- Runtime classpath for the collation benchmark: run.classpath plus the
     ICU analyzers jar and the ICU4J library jar. -->
<path id="collation.runtime.classpath">
  <path refid="run.classpath"/>
  <pathelement path="${analyzers-icu.jar}"/>
  <fileset dir="${common.dir}/../modules/analysis/icu/lib">
    <include name="icu4j-4.8.1.1.jar"/>
  </fileset>
</path>
|
|
|
|
|
2011-07-28 00:02:09 -04:00
|
|
|
<!-- Runs the collation benchmark in a forked JVM, writing its output to
     collation.output.file, then converts that output to a JIRA table with
     the perl script scripts/collation.bm2jira.pl. A failing perl run fails
     the build. -->
<target depends="compile,jar-analyzers-icu,top-100k-wiki-word-files" name="collation">
  <echo>Running contrib/benchmark with alg file: ${collation.alg.file}</echo>
  <java classname="org.apache.lucene.benchmark.byTask.Benchmark"
        classpathref="collation.runtime.classpath"
        fork="true"
        maxmemory="${task.mem}"
        output="${collation.output.file}">
    <arg file="${collation.alg.file}"/>
  </java>
  <echo>Benchmark output is in file: ${collation.output.file}</echo>

  <echo>Converting to JIRA table format...</echo>
  <exec failonerror="true" output="${collation.jira.output.file}" executable="perl">
    <arg value="scripts/collation.bm2jira.pl"/>
    <arg value="${collation.output.file}"/>
  </exec>
  <echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
</target>
|
|
|
|
|
2010-01-28 23:07:47 -05:00
|
|
|
<!-- Shingle benchmark: algorithm file, raw output file and JIRA-table output file. -->
<property location="conf/shingle.alg" name="shingle.alg.file"/>
<property value="${working.dir}/shingle.benchmark.output.txt"
          name="shingle.output.file"/>
<property value="${working.dir}/shingle.bm2jira.output.txt"
          name="shingle.jira.output.file"/>
|
|
|
|
|
|
|
|
<!-- Runtime classpath id for the shingle benchmark; currently just run.classpath. -->
<path id="shingle.runtime.classpath">
  <path refid="run.classpath"/>
</path>
|
|
|
|
|
2010-03-26 06:44:25 -04:00
|
|
|
<!-- Runs the shingle benchmark in a forked JVM, writing its output to
     shingle.output.file, then converts that output to a JIRA table with the
     perl script scripts/shingle.bm2jira.pl. A failing perl run fails the build. -->
<target depends="compile,get-files" name="shingle">
  <echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
  <java classname="org.apache.lucene.benchmark.byTask.Benchmark"
        classpathref="run.classpath"
        fork="true"
        maxmemory="${task.mem}"
        output="${shingle.output.file}">
    <arg file="${shingle.alg.file}"/>
  </java>
  <echo>Benchmark output is in file: ${shingle.output.file}</echo>

  <echo>Converting to JIRA table format...</echo>
  <exec failonerror="true" output="${shingle.jira.output.file}" executable="perl">
    <arg value="scripts/shingle.bm2jira.pl"/>
    <arg value="${shingle.output.file}"/>
  </exec>
  <echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
</target>
|
|
|
|
|
2012-03-30 14:04:43 -04:00
|
|
|
<!-- The ICU analysis module does not need to be compiled here; only its
     library jar is wanted, so just its "resolve" target is invoked. -->
<target name="resolve-icu">
  <ant inheritAll="false"
       target="resolve"
       dir="${common.dir}/../modules/analysis/icu/">
    <propertyset refid="uptodate.and.compiled.properties"/>
  </ant>
</target>
|
|
|
|
|
|
|
|
<!-- Extends the inherited init with ICU resolution and the jars of the modules this build's classpath references. -->
<target depends="contrib-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet" name="init"/>
|
2011-01-20 22:44:13 -05:00
|
|
|
|
2011-02-18 23:49:36 -05:00
|
|
|
<!--
  Deletes the JavaCC-generated sources of the demohtml parser: every *.java
  file in that package whose content matches "Generated.*By.*JavaCC".
  The fileset previously sat directly inside the target, where it only
  declares a data type and removes nothing; it has to be nested in <delete>.
-->
<target name="clean-javacc">
  <delete>
    <fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
      <containsregexp expression="Generated.*By.*JavaCC"/>
    </fileset>
  </delete>
</target>
|
|
|
|
|
|
|
|
<!-- Regenerates the demohtml parser from HTMLParser.jj into its own package
     directory. Runs only when javacc.present is set. -->
<target if="javacc.present" depends="init,javacc-check" name="javacc">
  <invoke-javacc
      outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
      target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"/>
</target>
|
2011-02-19 12:06:01 -05:00
|
|
|
|
2012-04-01 15:46:49 -04:00
|
|
|
<!-- Copies the .alg files into the test classes directory before running the inherited test compilation. -->
<target depends="copy-alg-files-for-testing,contrib-build.compile-test" name="compile-test"/>
|
|
|
|
<!-- Copies everything under conf/ to ${build.dir}/classes/test/conf so the
     files are available to tests as resources. -->
<target description="copy .alg files as resources for testing" name="copy-alg-files-for-testing">
  <copy todir="${build.dir}/classes/test/conf">
    <fileset dir="conf"/>
  </copy>
</target>
|
2007-04-17 03:11:04 -04:00
|
|
|
</project>
|