LUCENE-2844: spatial benchmark

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1536176 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
David Wayne Smiley 2013-10-27 18:17:45 +00:00
parent e45d67083e
commit bc1b8ac507
9 changed files with 523 additions and 6 deletions

View File

@ -24,6 +24,7 @@
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" scope="TEST" module-name="benchmark-conf" />
<orderEntry type="module" module-name="spatial" />
<orderEntry type="module" module-name="facet" />
<orderEntry type="module" module-name="highlighter" />
<orderEntry type="module" module-name="icu" />

View File

@ -11,7 +11,7 @@
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="module-library">
<orderEntry type="module-library" exported="">
<library>
<CLASSES>
<root url="file://$MODULE_DIR$/lib" />

View File

@ -79,6 +79,11 @@
<artifactId>lucene-facet</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>lucene-spatial</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>

View File

@ -37,6 +37,8 @@
<available file="temp/20news-18828.tar.gz" property="20news-18828.exists"/>
<available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
<available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>
<available file="temp/allCountries.txt.bz2" property="geonames.exists"/>
<available file="${working.dir}/geonames" property="geonames.expanded"/>
<available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
<available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
@ -62,6 +64,25 @@
<bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
</target>
<target name="geonames-files" depends="check-files">
<mkdir dir="temp"/>
<antcall target="get-geonames"/>
<antcall target="expand-geonames"/>
</target>
<target name="get-geonames" unless="geonames.exists">
<!-- note: latest data is at: http://download.geonames.org/export/dump/allCountries.zip
and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
and then compress with: bzip2 -9 -k file_random.txt -->
<get src="http://people.apache.org/~dsmiley/data/geonames_20130921_randomOrder_allCountries.txt.bz2"
dest="temp/allCountries.txt.bz2"/>
</target>
<target name="expand-geonames" unless="geonames.expanded">
<mkdir dir="${working.dir}/geonames"/>
<bunzip2 src="temp/allCountries.txt.bz2" dest="${working.dir}/geonames"/>
</target>
<target name="get-news-20" unless="20news-18828.exists">
<get src="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
dest="temp/news20.tar.gz"/>
@ -147,8 +168,10 @@
<pathelement path="${analyzers-common.jar}"/>
<pathelement path="${queryparser.jar}"/>
<pathelement path="${facet.jar}"/>
<pathelement path="${spatial.jar}"/>
<pathelement path="${queries.jar}"/>
<fileset dir="${common.dir}/analysis/icu/lib"/>
<fileset dir="${common.dir}/spatial/lib"/>
<path refid="base.classpath"/>
<fileset dir="lib"/>
</path>
@ -158,7 +181,8 @@
<pathelement path="${benchmark.ext.classpath}"/>
</path>
<target name="javadocs" depends="javadocs-memory,javadocs-highlighter,javadocs-analyzers-common,javadocs-queryparser,javadocs-facet,compile-core">
<target name="javadocs" depends="javadocs-memory,javadocs-highlighter,javadocs-analyzers-common,
javadocs-queryparser,javadocs-facet,javadocs-spatial,compile-core">
<invoke-module-javadoc>
<links>
<link href="../memory"/>
@ -166,6 +190,7 @@
<link href="../analyzers-common"/>
<link href="../queryparser"/>
<link href="../facet"/>
<link href="../spatial"/>
</links>
</invoke-module-javadoc>
</target>
@ -256,7 +281,7 @@
</ant>
</target>
<target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
<target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet,jar-spatial"/>
<target name="compile-test" depends="copy-alg-files-for-testing,module-build.compile-test"/>
<target name="copy-alg-files-for-testing" description="copy .alg files as resources for testing">

View File

@ -0,0 +1,111 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# Spatial search benchmark
# In order to use this, you'll need to first run 'ant geonames-files'.
# You may need more memory when running this: -Dtask.mem=1000M
# For docs on what options are available, see the javadocs.
### Spatial Context, Grid, Strategy config
doc.maker=org.apache.lucene.benchmark.byTask.feeds.SpatialDocMaker
# SpatialContext: see SpatialContextFactory.makeSpatialContext
#spatial.spatialContextFactory=com.spatial4j.core.context.jts.JtsSpatialContextFactory
#spatial.geo=true
#spatial.distCalculator=haversine
#spatial.worldBounds=...
# Spatial Grid: (PrefixTree) see SpatialPrefixTreeFactory.makeSPT
#spatial.prefixTree=geohash (or quad)
#spatial.maxLevels=11
#spatial.maxDistErr (in degrees) to compute maxLevels -- defaults to 1 meter's worth
# RecursivePrefixTreeStrategy:
spatial.docPointsOnly=true
#spatial.distErrPct=.25
#spatial.prefixGridScanLevel=-4
### Source & Doc
content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
line.parser=org.apache.lucene.benchmark.byTask.feeds.GeonamesLineParser
docs.file=work/geonames/allCountries.txt
doc.tokenized=false
# Next 3 props convert doc points to circles with a random radius and then optionally bbox'es
#doc.spatial.radiusDegrees=0.0
#doc.spatial.radiusDegreesRandPlusMinus=0.0
#doc.spatial.bbox=false
### Directory
directory=FSDirectory
#directory=RamDirectory
compound=false
merge.factor=10
ram.flush.mb=64
concurrent.merge.scheduler.max.thread.count=2
### Query
query.maker=org.apache.lucene.benchmark.byTask.feeds.SpatialFileQueryMaker
query.file=work/geonames/allCountries.txt
query.file.line.parser=org.apache.lucene.benchmark.byTask.feeds.GeonamesLineParser
query.file.maxQueries=1000
# Next 3 props convert query points to circles with a random radius and then optionally bbox'es
query.spatial.radiusDegrees=0
query.spatial.radiusDegreesRandPlusMinus=3
query.spatial.bbox=false
query.spatial.score=false
#query.spatial.predicate=Intersects
# (defaults to spatial.distErrPct)
query.spatial.distErrPct=qDistErrPct:0.0:0.025:0.1:0.5
### Misc
log.step.AddDoc = 100000
task.max.depth.log=1
# -------------------------------------------------------------------------------------
{ "Populate"
ResetSystemErase
CreateIndex
#1 million docs
[{ "MAddDocs" AddDoc} : 250000] : 4
ForceMerge(1)
CommitIndex
CloseIndex
RepSumByPref MAddDocs
} : 1
#set above round to 0 on subsequent runs if not changing indexing but experimenting with search
OpenReader
{"WarmJIT" Search > : 4000
CloseReader
{ "Rounds"
ResetSystemSoft
OpenReader
Search
{"RealQueries" Search > : 2000
CloseReader
NewRound
} : 4
#RepSumByName
RepSumByPrefRound RealQueries

View File

@ -0,0 +1,44 @@
package org.apache.lucene.benchmark.byTask.feeds;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A line parser for Geonames.org data.
* See <a href="http://download.geonames.org/export/dump/readme.txt">'geoname' table</a>.
* Requires {@link SpatialDocMaker}.
*/
public class GeonamesLineParser extends LineDocSource.LineParser {
/** This header will be ignored; the geonames format is fixed and doesn't have a header line. */
public GeonamesLineParser(String[] header) {
super(header);
}
@Override
public void parseLine(DocData docData, String line) {
String[] parts = line.split("\\t", 7);//no more than first 6 fields needed
// Sample data line:
// 3578267, Morne du Vitet, Morne du Vitet, 17.88333, -62.8, ...
// ID, Name, Alternate name (unused), Lat, Lon, ...
docData.setID(Integer.parseInt(parts[0]));//note: overwrites ID assigned by LineDocSource
docData.setName(parts[1]);
docData.setBody(parts[4]+","+parts[5]); // latitude , longitude
}
}

View File

@ -0,0 +1,207 @@
package org.apache.lucene.benchmark.byTask.feeds;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.spatial4j.core.context.SpatialContext;
import com.spatial4j.core.context.SpatialContextFactory;
import com.spatial4j.core.shape.Point;
import com.spatial4j.core.shape.Shape;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.spatial.SpatialStrategy;
import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTreeFactory;
import java.util.AbstractMap;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.Set;
/**
* Indexes spatial data according to a configured {@link SpatialStrategy} with optional
* shape transformation via a configured {@link ShapeConverter}. The converter can turn points into
* circles and bounding boxes, in order to vary the type of indexing performance tests.
* Unless it's subclass-ed to do otherwise, this class configures a {@link SpatialContext},
* {@link SpatialPrefixTree}, and {@link RecursivePrefixTreeStrategy}. The Strategy is made
* available to a query maker via the static method {@link #getSpatialStrategy(int)}.
* See spatial.alg for a listing of spatial parameters, in particular those starting with "spatial."
* and "doc.spatial".
*/
public class SpatialDocMaker extends DocMaker {
public static final String SPATIAL_FIELD = "spatial";
//cache spatialStrategy by round number
private static Map<Integer,SpatialStrategy> spatialStrategyCache = new HashMap<Integer,SpatialStrategy>();
private SpatialStrategy strategy;
private ShapeConverter shapeConverter;
/**
* Looks up the SpatialStrategy from the given round --
* {@link org.apache.lucene.benchmark.byTask.utils.Config#getRoundNumber()}. It's an error
* if it wasn't created already for this round -- when SpatialDocMaker is initialized.
*/
public static SpatialStrategy getSpatialStrategy(int roundNumber) {
SpatialStrategy result = spatialStrategyCache.get(roundNumber);
if (result == null) {
throw new IllegalStateException("Strategy should have been init'ed by SpatialDocMaker by now");
}
return result;
}
/**
* Builds a SpatialStrategy from configuration options.
*/
protected SpatialStrategy makeSpatialStrategy(final Config config) {
//A Map view of Config that prefixes keys with "spatial."
Map<String, String> configMap = new AbstractMap<String, String>() {
@Override
public Set<Entry<String, String>> entrySet() {
throw new UnsupportedOperationException();
}
@Override
public String get(Object key) {
return config.get("spatial." + key, null);
}
};
SpatialContext ctx = SpatialContextFactory.makeSpatialContext(configMap, null);
//Some day the strategy might be initialized with a factory but such a factory
// is non-existent.
return makeSpatialStrategy(config, configMap, ctx);
}
protected SpatialStrategy makeSpatialStrategy(final Config config, Map<String, String> configMap,
SpatialContext ctx) {
//A factory for the prefix tree grid
SpatialPrefixTree grid = SpatialPrefixTreeFactory.makeSPT(configMap, null, ctx);
RecursivePrefixTreeStrategy strategy = new RecursivePrefixTreeStrategy(grid, SPATIAL_FIELD) {
{
//protected field
this.pointsOnly = config.get("spatial.docPointsOnly", false);
}
};
int prefixGridScanLevel = config.get("query.spatial.prefixGridScanLevel", -4);
if (prefixGridScanLevel < 0)
prefixGridScanLevel = grid.getMaxLevels() + prefixGridScanLevel;
strategy.setPrefixGridScanLevel(prefixGridScanLevel);
double distErrPct = config.get("spatial.distErrPct", .025);//doc & query; a default
strategy.setDistErrPct(distErrPct);
return strategy;
}
@Override
public void setConfig(Config config, ContentSource source) {
super.setConfig(config, source);
SpatialStrategy existing = spatialStrategyCache.get(config.getRoundNumber());
if (existing == null) {
//new round; we need to re-initialize
strategy = makeSpatialStrategy(config);
spatialStrategyCache.put(config.getRoundNumber(), strategy);
//TODO remove previous round config?
shapeConverter = makeShapeConverter(strategy, config, "doc.spatial.");
System.out.println("Spatial Strategy: " + strategy);
}
}
/**
* Optionally converts points to circles, and optionally bbox'es result.
*/
public static ShapeConverter makeShapeConverter(final SpatialStrategy spatialStrategy,
Config config, String configKeyPrefix) {
//by default does no conversion
final double radiusDegrees = config.get(configKeyPrefix+"radiusDegrees", 0.0);
final double plusMinus = config.get(configKeyPrefix+"radiusDegreesRandPlusMinus", 0.0);
final boolean bbox = config.get(configKeyPrefix + "bbox", false);
return new ShapeConverter() {
@Override
public Shape convert(Shape shape) {
if (shape instanceof Point && (radiusDegrees != 0.0 || plusMinus != 0.0)) {
Point point = (Point)shape;
double radius = radiusDegrees;
if (plusMinus > 0.0) {
Random random = new Random(point.hashCode());//use hashCode so it's reproducibly random
radius += random.nextDouble() * 2 * plusMinus - plusMinus;
radius = Math.abs(radius);//can happen if configured plusMinus > radiusDegrees
}
shape = spatialStrategy.getSpatialContext().makeCircle(point, radius);
}
if (bbox)
shape = shape.getBoundingBox();
return shape;
}
};
}
/** Converts one shape to another. Created by
* {@link #makeShapeConverter(org.apache.lucene.spatial.SpatialStrategy, org.apache.lucene.benchmark.byTask.utils.Config, String)} */
public interface ShapeConverter {
Shape convert(Shape shape);
}
@Override
public Document makeDocument() throws Exception {
DocState docState = getDocState();
Document doc = super.makeDocument();
// Set SPATIAL_FIELD from body
DocData docData = docState.docData;
// makeDocument() resets docState.getBody() so we can't look there; look in Document
String shapeStr = doc.getField(DocMaker.BODY_FIELD).stringValue();
Shape shape = makeShapeFromString(strategy, docData.getName(), shapeStr);
if (shape != null) {
shape = shapeConverter.convert(shape);
//index
for (Field f : strategy.createIndexableFields(shape)) {
doc.add(f);
}
}
return doc;
}
public static Shape makeShapeFromString(SpatialStrategy strategy, String name, String shapeStr) {
if (shapeStr != null && shapeStr.length() > 0) {
try {
return strategy.getSpatialContext().readShape(shapeStr);
} catch (Exception e) {//InvalidShapeException TODO
System.err.println("Shape "+name+" wasn't parseable: "+e+" (skipping it)");
return null;
}
}
return null;
}
@Override
public Document makeDocument(int size) throws Exception {
//TODO consider abusing the 'size' notion to number of shapes per document
throw new UnsupportedOperationException();
}
}

View File

@ -0,0 +1,120 @@
package org.apache.lucene.benchmark.byTask.feeds;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.spatial4j.core.shape.Shape;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.queries.CustomScoreQuery;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.spatial.SpatialStrategy;
import org.apache.lucene.spatial.query.SpatialArgs;
import org.apache.lucene.spatial.query.SpatialOperation;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
* Reads spatial data from the body field docs from an internally created {@link LineDocSource}.
* It's parsed by {@link com.spatial4j.core.context.SpatialContext#readShape(String)} and then
* further manipulated via a configurable {@link SpatialDocMaker.ShapeConverter}. When using point
* data, it's likely you'll want to configure the shape converter so that the query shapes actually
* cover a region. The queries are all created & cached in advance. This query maker works in
* conjunction with {@link SpatialDocMaker}. See spatial.alg for a listing of options, in
* particular the options starting with "query.".
*/
public class SpatialFileQueryMaker extends AbstractQueryMaker {
protected SpatialStrategy strategy;
protected double distErrPct;//NaN if not set
protected SpatialOperation operation;
protected boolean score;
protected SpatialDocMaker.ShapeConverter shapeConverter;
@Override
public void setConfig(Config config) throws Exception {
strategy = SpatialDocMaker.getSpatialStrategy(config.getRoundNumber());
shapeConverter = SpatialDocMaker.makeShapeConverter(strategy, config, "query.spatial.");
distErrPct = config.get("query.spatial.distErrPct", Double.NaN);
operation = SpatialOperation.get(config.get("query.spatial.predicate", "Intersects"));
score = config.get("query.spatial.score", false);
super.setConfig(config);//call last, will call prepareQueries()
}
@Override
protected Query[] prepareQueries() throws Exception {
final int maxQueries = config.get("query.file.maxQueries", 1000);
Config srcConfig = new Config(new Properties());
srcConfig.set("docs.file", config.get("query.file", null));
srcConfig.set("line.parser", config.get("query.file.line.parser", null));
srcConfig.set("content.source.forever", "false");
List<Query> queries = new ArrayList<>();
LineDocSource src = new LineDocSource();
try {
src.setConfig(srcConfig);
src.resetInputs();
DocData docData = new DocData();
for (int i = 0; i < maxQueries; i++) {
docData = src.getNextDocData(docData);
Shape shape = SpatialDocMaker.makeShapeFromString(strategy, docData.getName(), docData.getBody());
if (shape != null) {
shape = shapeConverter.convert(shape);
queries.add(makeQueryFromShape(shape));
} else {
i--;//skip
}
}
} catch (NoMoreDataException e) {
//all-done
} finally {
src.close();
}
return queries.toArray(new Query[queries.size()]);
}
protected Query makeQueryFromShape(Shape shape) {
SpatialArgs args = new SpatialArgs(operation, shape);
if (!Double.isNaN(distErrPct))
args.setDistErrPct(distErrPct);
if (score) {
ValueSource valueSource = strategy.makeDistanceValueSource(shape.getCenter());
return new CustomScoreQuery(strategy.makeQuery(args), new FunctionQuery(valueSource));
} else {
//strategy.makeQuery() could potentially score (isn't well defined) so instead we call
// makeFilter() and wrap
Filter filter = strategy.makeFilter(args);
if (filter instanceof QueryWrapperFilter) {
return ((QueryWrapperFilter)filter).getQuery();
} else {
return new ConstantScoreQuery(filter);
}
}
}
}

View File

@ -469,9 +469,10 @@ regular index/search work tasks, report tasks, and control tasks.
its own queryMaker instance.
<li>
<font color="#FF0066">CommitIndex</font> and
<font color="#FF0066">Optimize</font> can be used to commit
changes to the index and/or optimize the index created thus
far.
<font color="#FF0066">ForceMerge</font> can be used to commit
changes to the index then merge the index segments. The integer
parameter specifies how many segments to merge down to (default
1).
<li>
<font color="#FF0066">WriteLineDoc</font> prepares a 'line'
file where each line holds a document with <i>title</i>,
@ -593,6 +594,9 @@ Here is a list of currently defined properties:
<ul><li>doc.delete.step
</li></ul>
</li>
<li><b>Spatial</b>: Numerous; see spatial.alg
</li>
<li><b>Task alternative packages</b>:
<ul><li>alt.tasks.packages