From 14ea836c0b95fe64a32ea6a99a895c2be2bd6d6b Mon Sep 17 00:00:00 2001
From: Steven Rowe
Date: Mon, 28 Jan 2013 17:18:48 +0000
Subject: [PATCH] LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable
analyzer creation via the resulting factories using NewAnalyzerTask.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1439510 13f79535-47bb-0310-9956-ffa450edef68
---
lucene/CHANGES.txt | 3 +
lucene/benchmark/conf/shingle.alg | 28 +-
lucene/benchmark/scripts/shingle.bm2jira.pl | 2 +-
.../lucene/benchmark/byTask/PerfRunData.java | 9 +-
.../byTask/tasks/AnalyzerFactoryTask.java | 459 ++++++++++++++++++
.../byTask/tasks/NewAnalyzerTask.java | 103 ++--
.../byTask/tasks/NewShingleAnalyzerTask.java | 117 -----
.../benchmark/byTask/tasks/PerfTask.java | 10 +
.../benchmark/byTask/utils/Algorithm.java | 68 ++-
.../byTask/utils/AnalyzerFactory.java | 132 +++++
.../benchmark/byTask/TestPerfTasksLogic.java | 87 ++--
.../test-mapping-ISOLatin1Accent-partial.txt | 30 ++
12 files changed, 843 insertions(+), 205 deletions(-)
create mode 100644 lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AnalyzerFactoryTask.java
delete mode 100644 lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java
create mode 100644 lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/AnalyzerFactory.java
create mode 100644 lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/test-mapping-ISOLatin1Accent-partial.txt
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b36d3972c89..0bbf862d887 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -79,6 +79,9 @@ New Features
near-real-time reader is opened that contains those changes.
(Robert Muir, Mike McCandless)
+* LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable analyzer
+ creation via the resulting factories using NewAnalyzerTask. (Steve Rowe)
+
API Changes
* LUCENE-4709: FacetResultNode no longer has a residue field. (Shai Erera)
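For orientation before the per-file diffs: the two tasks pair up as below, a minimal sketch in the benchmark's .alg syntax assembled from examples elsewhere in this patch (the factory name is illustrative):

    -AnalyzerFactory(name:lowercased-standard,
                     StandardTokenizer,
                     LowerCaseFilter)
    -NewAnalyzer(lowercased-standard)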
diff --git a/lucene/benchmark/conf/shingle.alg b/lucene/benchmark/conf/shingle.alg
index 5fb68763c38..b0744341c76 100644
--- a/lucene/benchmark/conf/shingle.alg
+++ b/lucene/benchmark/conf/shingle.alg
@@ -19,25 +19,43 @@ doc.body.tokenized=true
docs.dir=reuters-out
log.step=1000
+-AnalyzerFactory(name:shingle-bigrams-unigrams,
+ StandardTokenizer,
+ ShingleFilter(maxShingleSize:2, outputUnigrams:true))
+
+-AnalyzerFactory(name:shingle-bigrams,
+ StandardTokenizer,
+ ShingleFilter(maxShingleSize:2, outputUnigrams:false))
+
+-AnalyzerFactory(name:shingle-4grams-unigrams,
+ StandardTokenizer,
+ ShingleFilter(maxShingleSize:4, outputUnigrams:true))
+
+-AnalyzerFactory(name:shingle-4grams,
+ StandardTokenizer,
+ ShingleFilter(maxShingleSize:4, outputUnigrams:false))
+
+-AnalyzerFactory(name:standard-tokenizer-only, StandardTokenizer)
+
{ "Rounds"
- -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:true)
+ -NewAnalyzer(shingle-bigrams-unigrams)
-ResetInputs
{ "BigramsAndUnigrams" { ReadTokens > : 10000 }
- -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:false)
+ -NewAnalyzer(shingle-bigrams)
-ResetInputs
{ "BigramsOnly" { ReadTokens > : 10000 }
- -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:true)
+ -NewAnalyzer(shingle-4grams-unigrams)
-ResetInputs
{ "FourgramsAndUnigrams" { ReadTokens > : 10000 }
- -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:false)
+ -NewAnalyzer(shingle-4grams)
-ResetInputs
{ "FourgramsOnly" { ReadTokens > : 10000 }
- -NewAnalyzer(standard.StandardAnalyzer)
+ -NewAnalyzer(standard-tokenizer-only)
-ResetInputs
{ "UnigramsOnly" { ReadTokens > : 10000 }
diff --git a/lucene/benchmark/scripts/shingle.bm2jira.pl b/lucene/benchmark/scripts/shingle.bm2jira.pl
index ce6d1936f67..728dc9b8045 100644
--- a/lucene/benchmark/scripts/shingle.bm2jira.pl
+++ b/lucene/benchmark/scripts/shingle.bm2jira.pl
@@ -51,7 +51,7 @@ while (<>) {
# Print out platform info
print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
-if ($^O =~ /win/i) {
+if ($^O =~ /(?<!dar)win/i) {
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
@@ ... @@
* Directory, Writer, Reader.
* Taxonomy Directory, Writer, Reader.
* DocMaker, FacetSource and a few instances of QueryMaker.
+ * Named AnalysisFactories.
* Analyzer.
* Statistics data which updated during the run.
*
@@ -78,6 +81,7 @@ public class PerfRunData implements Closeable {
// directory, analyzer, docMaker - created at startup.
// reader, writer, searcher - maintained by basic tasks.
private Directory directory;
+ private Map<String,AnalyzerFactory> analyzerFactories = new HashMap<String,AnalyzerFactory>();
private Analyzer analyzer;
private DocMaker docMaker;
private ContentSource contentSource;
@@ -358,7 +362,7 @@ public class PerfRunData implements Closeable {
}
/**
- * @return Returns the anlyzer.
+ * @return Returns the analyzer.
*/
public Analyzer getAnalyzer() {
return analyzer;
@@ -434,4 +438,7 @@ public class PerfRunData implements Closeable {
return qm;
}
+ public Map<String,AnalyzerFactory> getAnalyzerFactories() {
+ return analyzerFactories;
+ }
}
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AnalyzerFactoryTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AnalyzerFactoryTask.java
new file mode 100644
index 00000000000..d69fe64a413
--- /dev/null
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AnalyzerFactoryTask.java
@@ -0,0 +1,459 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.FilesystemResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;
+import org.apache.lucene.util.Version;
+
+import java.io.File;
+import java.io.StreamTokenizer;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * Analyzer factory construction task. The name given to the constructed factory may
+ * be given to NewAnalyzerTask, which will call AnalyzerFactory.create().
+ *
+ * Params are in the form argname:argvalue or argname:"argvalue" or argname:'argvalue';
+ * use backslashes to escape '"' or "'" inside a quoted value when it's used as the enclosing
+ * quotation mark.
+ *
+ * Specify params in a comma-separated list of the following, in order:
+ *
+ * - Analyzer args:
+ *   - Required: name:analyzer-factory-name
+ *   - Optional: positionIncrementGap:int value (default: 0)
+ *   - Optional: offsetGap:int value (default: 1)
+ * - zero or more CharFilterFactory's, followed by
+ * - exactly one TokenizerFactory, followed by
+ * - zero or more TokenFilterFactory's
+ *
+ * Each component analysis factory may specify luceneMatchVersion (defaults to
+ * {@link Version#LUCENE_CURRENT}) and any of the args understood by the specified
+ * *Factory class, in the above-described param format.
+ *
+ * Example:
+ *
+ * -AnalyzerFactory(name:'strip html, fold to ascii, whitespace tokenize, max 10k tokens',
+ * positionIncrementGap:100,
+ * HTMLStripCharFilter,
+ * MappingCharFilter(mapping:'mapping-FoldToASCII.txt'),
+ * WhitespaceTokenizer(luceneMatchVersion:LUCENE_42),
+ * TokenLimitFilter(maxTokenCount:10000, consumeAllTokens:false))
+ * [...]
+ * -NewAnalyzer('strip html, fold to ascii, whitespace tokenize, max 10k tokens')
+ *
+ *
+ * AnalyzerFactory will direct analysis component factories to look for resources
+ * under the directory specified in the "work.dir" property.
+ */
+public class AnalyzerFactoryTask extends PerfTask {
+ private static final String LUCENE_ANALYSIS_PACKAGE_PREFIX = "org.apache.lucene.analysis.";
+ private static final Pattern ANALYSIS_COMPONENT_SUFFIX_PATTERN
+ = Pattern.compile("(?s:(?:(?:Token|Char)?Filter|Tokenizer)(?:Factory)?)$");
+ private static final Pattern TRAILING_DOT_ZERO_PATTERN = Pattern.compile("\\.0$");
+
+ private enum ArgType {ANALYZER_ARG, ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER, TOKENFILTER }
+
+ String factoryName = null;
+ Integer positionIncrementGap = null;
+ Integer offsetGap = null;
+ private List<CharFilterFactory> charFilterFactories = new ArrayList<CharFilterFactory>();
+ private TokenizerFactory tokenizerFactory = null;
+ private List<TokenFilterFactory> tokenFilterFactories = new ArrayList<TokenFilterFactory>();
+
+ public AnalyzerFactoryTask(PerfRunData runData) {
+ super(runData);
+ }
+
+ @Override
+ public int doLogic() {
+ return 1;
+ }
+
+ /**
+ * Sets the params.
+ * Analysis component factory names may optionally include the "Factory" suffix.
+ *
+ * @param params analysis pipeline specification: name, (optional) positionIncrementGap,
+ * (optional) offsetGap, 0+ CharFilterFactory's, 1 TokenizerFactory,
+ * and 0+ TokenFilterFactory's
+ */
+ @Override
+ public void setParams(String params) {
+ super.setParams(params);
+ ArgType expectedArgType = ArgType.ANALYZER_ARG;
+
+ final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
+ stok.commentChar('#');
+ stok.quoteChar('"');
+ stok.quoteChar('\'');
+ stok.eolIsSignificant(false);
+ stok.ordinaryChar('(');
+ stok.ordinaryChar(')');
+ stok.ordinaryChar(':');
+ stok.ordinaryChar(',');
+ try {
+ while (stok.nextToken() != StreamTokenizer.TT_EOF) {
+ switch (stok.ttype) {
+ case ',': {
+ // Do nothing
+ break;
+ }
+ case StreamTokenizer.TT_WORD: {
+ if (expectedArgType.equals(ArgType.ANALYZER_ARG)) {
+ final String argName = stok.sval;
+ if ( ! argName.equalsIgnoreCase("name")
+ && ! argName.equalsIgnoreCase("positionIncrementGap")
+ && ! argName.equalsIgnoreCase("offsetGap")) {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Missing 'name' param to AnalyzerFactory: '" + params + "'");
+ }
+ stok.nextToken();
+ if (stok.ttype != ':') {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
+ }
+
+ stok.nextToken();
+ String argValue = stok.sval;
+ switch (stok.ttype) {
+ case StreamTokenizer.TT_NUMBER: {
+ argValue = Double.toString(stok.nval);
+ // Drop the ".0" from numbers, for integer arguments
+ argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
+ // Intentional fallthrough
+ }
+ case '"':
+ case '\'':
+ case StreamTokenizer.TT_WORD: {
+ if (argName.equalsIgnoreCase("name")) {
+ factoryName = argValue;
+ expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER;
+ } else {
+ int intArgValue = 0;
+ try {
+ intArgValue = Integer.parseInt(argValue);
+ } catch (NumberFormatException e) {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + argValue + "'", e);
+ }
+ if (argName.equalsIgnoreCase("positionIncrementGap")) {
+ positionIncrementGap = intArgValue;
+ } else if (argName.equalsIgnoreCase("offsetGap")) {
+ offsetGap = intArgValue;
+ }
+ }
+ break;
+ }
+ case StreamTokenizer.TT_EOF: {
+ throw new RuntimeException("Unexpected EOF: " + stok.toString());
+ }
+ default: {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
+ }
+ }
+ } else if (expectedArgType.equals(ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)) {
+ final String argName = stok.sval;
+
+ if (argName.equalsIgnoreCase("positionIncrementGap")
+ || argName.equalsIgnoreCase("offsetGap")) {
+ stok.nextToken();
+ if (stok.ttype != ':') {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
+ }
+ stok.nextToken();
+ int intArgValue = (int)stok.nval;
+ switch (stok.ttype) {
+ case '"':
+ case '\'':
+ case StreamTokenizer.TT_WORD: {
+ intArgValue = 0;
+ try {
+ intArgValue = Integer.parseInt(stok.sval.trim());
+ } catch (NumberFormatException e) {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + stok.sval + "'", e);
+ }
+ // Intentional fall-through
+ }
+ case StreamTokenizer.TT_NUMBER: {
+ if (argName.equalsIgnoreCase("positionIncrementGap")) {
+ positionIncrementGap = intArgValue;
+ } else if (argName.equalsIgnoreCase("offsetGap")) {
+ offsetGap = intArgValue;
+ }
+ break;
+ }
+ case StreamTokenizer.TT_EOF: {
+ throw new RuntimeException("Unexpected EOF: " + stok.toString());
+ }
+ default: {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
+ }
+ }
+ break;
+ }
+ try {
+ final Class<? extends CharFilterFactory> clazz;
+ clazz = lookupAnalysisClass(argName, CharFilterFactory.class);
+ createAnalysisPipelineComponent(stok, clazz);
+ } catch (IllegalArgumentException e) {
+ try {
+ final Class<? extends TokenizerFactory> clazz;
+ clazz = lookupAnalysisClass(argName, TokenizerFactory.class);
+ createAnalysisPipelineComponent(stok, clazz);
+ expectedArgType = ArgType.TOKENFILTER;
+ } catch (IllegalArgumentException e2) {
+ throw new RuntimeException("Line #" + lineno(stok) + ": Can't find class '"
+ + argName + "' as CharFilterFactory or TokenizerFactory");
+ }
+ }
+ } else { // expectedArgType = ArgType.TOKENFILTER
+ final String className = stok.sval;
+ final Class<? extends TokenFilterFactory> clazz;
+ try {
+ clazz = lookupAnalysisClass(className, TokenFilterFactory.class);
+ } catch (IllegalArgumentException e) {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Can't find class '" + className + "' as TokenFilterFactory");
+ }
+ createAnalysisPipelineComponent(stok, clazz);
+ }
+ break;
+ }
+ default: {
+ throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
+ }
+ }
+ }
+ } catch (RuntimeException e) {
+ if (e.getMessage().startsWith("Line #")) {
+ throw e;
+ } else {
+ throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
+ }
+ } catch (Throwable t) {
+ throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
+ }
+
+ final AnalyzerFactory analyzerFactory = new AnalyzerFactory
+ (charFilterFactories, tokenizerFactory, tokenFilterFactories);
+ analyzerFactory.setPositionIncrementGap(positionIncrementGap);
+ analyzerFactory.setOffsetGap(offsetGap);
+ getRunData().getAnalyzerFactories().put(factoryName, analyzerFactory);
+ }
+
+ /**
+ * Instantiates the given analysis factory class after pulling params from
+ * the given stream tokenizer, then stores the result in the appropriate
+ * pipeline component list.
+ *
+ * @param stok stream tokenizer from which to draw analysis factory params
+ * @param clazz analysis factory class to instantiate
+ */
+ private void createAnalysisPipelineComponent
+ (StreamTokenizer stok, Class<? extends AbstractAnalysisFactory> clazz) {
+ final AbstractAnalysisFactory instance;
+ try {
+ instance = clazz.newInstance();
+ } catch (Exception e) {
+ throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
+ }
+ Version luceneMatchVersion = null;
+ Map<String,String> argMap = new HashMap<String,String>();
+ boolean parenthetical = false;
+ try {
+ WHILE_LOOP: while (stok.nextToken() != StreamTokenizer.TT_EOF) {
+ switch (stok.ttype) {
+ case ',': {
+ if (parenthetical) {
+ // Do nothing
+ break;
+ } else {
+ // Finished reading this analysis factory configuration
+ break WHILE_LOOP;
+ }
+ }
+ case '(': {
+ if (parenthetical) {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Unexpected opening parenthesis.");
+ }
+ parenthetical = true;
+ break;
+ }
+ case ')': {
+ if (parenthetical) {
+ parenthetical = false;
+ } else {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Unexpected closing parenthesis.");
+ }
+ break;
+ }
+ case StreamTokenizer.TT_WORD: {
+ if ( ! parenthetical) {
+ throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token '" + stok.sval + "'");
+ }
+ String argName = stok.sval;
+ stok.nextToken();
+ if (stok.ttype != ':') {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to " + clazz.getSimpleName());
+ }
+ stok.nextToken();
+ String argValue = stok.sval;
+ switch (stok.ttype) {
+ case StreamTokenizer.TT_NUMBER: {
+ argValue = Double.toString(stok.nval);
+ // Drop the ".0" from numbers, for integer arguments
+ argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
+ // Intentional fall-through
+ }
+ case '"':
+ case '\'':
+ case StreamTokenizer.TT_WORD: {
+ if (argName.equalsIgnoreCase("luceneMatchVersion")) {
+ try {
+ luceneMatchVersion = Version.parseLeniently(argValue);
+ } catch (IllegalArgumentException e) {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Unrecognized luceneMatchVersion '" + argValue + "'", e);
+ }
+ } else {
+ argMap.put(argName, argValue);
+ }
+ break;
+ }
+ case StreamTokenizer.TT_EOF: {
+ throw new RuntimeException("Unexpected EOF: " + stok.toString());
+ }
+ default: {
+ throw new RuntimeException
+ ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
+ }
+ }
+ }
+ }
+ }
+
+ instance.setLuceneMatchVersion
+ (null == luceneMatchVersion ? Version.LUCENE_CURRENT : luceneMatchVersion);
+ instance.init(argMap);
+ if (instance instanceof ResourceLoaderAware) {
+ File baseDir = new File(getRunData().getConfig().get("work.dir", "work")).getAbsoluteFile();
+ ((ResourceLoaderAware)instance).inform(new FilesystemResourceLoader(baseDir));
+ }
+ if (CharFilterFactory.class.isAssignableFrom(clazz)) {
+ charFilterFactories.add((CharFilterFactory)instance);
+ } else if (TokenizerFactory.class.isAssignableFrom(clazz)) {
+ tokenizerFactory = (TokenizerFactory)instance;
+ } else if (TokenFilterFactory.class.isAssignableFrom(clazz)) {
+ tokenFilterFactories.add((TokenFilterFactory)instance);
+ }
+ } catch (RuntimeException e) {
+ if (e.getMessage().startsWith("Line #")) {
+ throw (e);
+ } else {
+ throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
+ }
+ } catch (Throwable t) {
+ throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
+ }
+ }
+
+ /**
+ * This method looks up a class with its fully qualified name (FQN), or a short-name
+ * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis."
+ * as the package prefix (e.g. "standard.ClassicTokenizerFactory" ->
+ * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
+ *
+ * If className contains a period, the class is first looked up as-is, assuming that it
+ * is an FQN. If this fails, lookup is retried after prepending the Lucene analysis
+ * package prefix to the class name.
+ *
+ * If className does not contain a period, the analysis SPI *Factory.lookupClass()
+ * methods are used to find the class.
+ *
+ * @param className The name or the short name of the class.
+ * @param expectedType The superclass className is expected to extend
+ * @return the loaded class.
+ * @throws ClassNotFoundException if lookup fails
+ */
+ public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
+ throws ClassNotFoundException {
+ if (className.contains(".")) {
+ try {
+ // First, try className == FQN
+ return Class.forName(className).asSubclass(expectedType);
+ } catch (ClassNotFoundException e) {
+ try {
+ // Second, retry lookup after prepending the Lucene analysis package prefix
+ return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
+ } catch (ClassNotFoundException e1) {
+ throw new ClassNotFoundException("Can't find class '" + className
+ + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
+ }
+ }
+ }
+ // No dot - use analysis SPI lookup
+ final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
+ if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
+ return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
+ } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
+ return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
+ } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
+ return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
+ }
+
+ throw new ClassNotFoundException("Can't find class '" + className + "'");
+ }
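+
+ // Illustrative resolutions (names taken from examples in this patch; assumes the
+ // analysis module's default SPI registrations are on the classpath):
+ //
+ //   "WhitespaceTokenizer"              - suffix pattern strips "Tokenizer", then
+ //                                        TokenizerFactory.lookupClass("Whitespace")
+ //   "HTMLStripCharFilter"              - suffix pattern strips "CharFilter", then
+ //                                        CharFilterFactory.lookupClass("HTMLStrip")
+ //   "standard.ClassicTokenizerFactory" - Class.forName() succeeds only after
+ //                                        prepending "org.apache.lucene.analysis."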
+
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
+ */
+ @Override
+ public boolean supportsParams() {
+ return true;
+ }
+
+ /** Returns the current line in the algorithm file */
+ public int lineno(StreamTokenizer stok) {
+ return getAlgLineNum() + stok.lineno();
+ }
+}
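For readers less familiar with java.io.StreamTokenizer: the parser above depends on the tokenizer configuration shown in this standalone sketch (the class name and sample params string are invented for illustration; the setup mirrors setParams()). Running it prints each token with its type:

    import java.io.StreamTokenizer;
    import java.io.StringReader;

    public class ParamsTokenizerDemo {
      public static void main(String[] args) throws Exception {
        // Same tokenizer setup as AnalyzerFactoryTask.setParams()
        StreamTokenizer stok = new StreamTokenizer(
            new StringReader("name:'strip html', positionIncrementGap:100, HTMLStripCharFilter"));
        stok.commentChar('#');
        stok.quoteChar('"');
        stok.quoteChar('\'');
        stok.eolIsSignificant(false);
        stok.ordinaryChar('(');
        stok.ordinaryChar(')');
        stok.ordinaryChar(':');
        stok.ordinaryChar(',');
        while (stok.nextToken() != StreamTokenizer.TT_EOF) {
          switch (stok.ttype) {
            case StreamTokenizer.TT_WORD:   System.out.println("WORD   " + stok.sval); break;
            case StreamTokenizer.TT_NUMBER: System.out.println("NUMBER " + stok.nval); break;
            case '"':
            case '\'':                      System.out.println("QUOTED " + stok.sval); break;
            default:                        System.out.println("CHAR   " + (char) stok.ttype); break;
          }
        }
      }
    }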
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java
index 8ce123e160f..08543944b20 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java
@@ -16,10 +16,16 @@ package org.apache.lucene.benchmark.byTask.tasks;
*/
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;
import org.apache.lucene.util.Version;
import java.io.IOException;
+import java.io.StreamTokenizer;
+import java.io.StringReader;
import java.util.*;
import java.lang.reflect.Constructor;
@@ -28,12 +34,12 @@ import java.lang.reflect.Constructor;
*
*/
public class NewAnalyzerTask extends PerfTask {
- private List<String> analyzerClassNames;
+ private List<String> analyzerNames;
private int current;
public NewAnalyzerTask(PerfRunData runData) {
super(runData);
- analyzerClassNames = new ArrayList<String>();
+ analyzerNames = new ArrayList<String>();
}
public static final Analyzer createAnalyzer(String className) throws Exception{
@@ -50,55 +56,98 @@ public class NewAnalyzerTask extends PerfTask {
@Override
public int doLogic() throws IOException {
- String className = null;
+ String analyzerName = null;
try {
- if (current >= analyzerClassNames.size()) {
+ if (current >= analyzerNames.size()) {
current = 0;
}
- className = analyzerClassNames.get(current++);
+ analyzerName = analyzerNames.get(current++);
Analyzer analyzer = null;
- if (null == className || 0 == className.length()) {
- className = "org.apache.lucene.analysis.standard.StandardAnalyzer";
+ if (null == analyzerName || 0 == analyzerName.length()) {
+ analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer";
}
- if (-1 == className.indexOf(".")) {
- try {
- // If no package, first attempt to instantiate a core analyzer
- String coreClassName = "org.apache.lucene.analysis.core." + className;
- analyzer = createAnalyzer(coreClassName);
- className = coreClassName;
- } catch (ClassNotFoundException e) {
- // If not a core analyzer, try the base analysis package
- className = "org.apache.lucene.analysis." + className;
- analyzer = createAnalyzer(className);
- }
+ // First, lookup analyzerName as a named analyzer factory
+ AnalyzerFactory factory = getRunData().getAnalyzerFactories().get(analyzerName);
+ if (null != factory) {
+ analyzer = factory.create();
} else {
- if (className.startsWith("standard.")) {
- className = "org.apache.lucene.analysis." + className;
+ if (analyzerName.contains(".")) {
+ if (analyzerName.startsWith("standard.")) {
+ analyzerName = "org.apache.lucene.analysis." + analyzerName;
+ }
+ analyzer = createAnalyzer(analyzerName);
+ } else { // No package
+ try {
+ // Attempt to instantiate a core analyzer
+ String coreClassName = "org.apache.lucene.analysis.core." + analyzerName;
+ analyzer = createAnalyzer(coreClassName);
+ analyzerName = coreClassName;
+ } catch (ClassNotFoundException e) {
+ // If not a core analyzer, try the base analysis package
+ analyzerName = "org.apache.lucene.analysis." + analyzerName;
+ analyzer = createAnalyzer(analyzerName);
+ }
}
- analyzer = createAnalyzer(className);
}
getRunData().setAnalyzer(analyzer);
- System.out.println("Changed Analyzer to: " + className);
} catch (Exception e) {
- throw new RuntimeException("Error creating Analyzer: " + className, e);
+ throw new RuntimeException("Error creating Analyzer: " + analyzerName, e);
}
return 1;
}
/**
- * Set the params (analyzerClassName only), Comma-separate list of Analyzer class names. If the Analyzer lives in
+ * Set the params (analyzerName only), a comma-separated list of Analyzer class names. If the Analyzer lives in
* org.apache.lucene.analysis, the name can be shortened by dropping the o.a.l.a part of the Fully Qualified Class Name.
*
+ * Analyzer names may also refer to previously defined AnalyzerFactory's.
+ *
* Example Declaration: {"NewAnalyzer" NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer) >
+ *
+ * Example AnalyzerFactory usage:
+ *
+ * -AnalyzerFactory(name:'whitespace tokenized',WhitespaceTokenizer)
+ * -NewAnalyzer('whitespace tokenized')
+ *
* @param params analyzerClassName, or empty for the StandardAnalyzer
*/
@Override
public void setParams(String params) {
super.setParams(params);
- for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) {
- String s = tokenizer.nextToken();
- analyzerClassNames.add(s.trim());
+ final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
+ stok.quoteChar('"');
+ stok.quoteChar('\'');
+ stok.eolIsSignificant(false);
+ stok.ordinaryChar(',');
+ try {
+ while (stok.nextToken() != StreamTokenizer.TT_EOF) {
+ switch (stok.ttype) {
+ case ',': {
+ // Do nothing
+ break;
+ }
+ case '\'':
+ case '\"':
+ case StreamTokenizer.TT_WORD: {
+ analyzerNames.add(stok.sval);
+ break;
+ }
+ default: {
+ throw new RuntimeException("Unexpected token: " + stok.toString());
+ }
+ }
+ }
+ } catch (RuntimeException e) {
+ if (e.getMessage().startsWith("Line #")) {
+ throw e;
+ } else {
+ throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", e);
+ }
+ } catch (Throwable t) {
+ throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", t);
}
+
+
}
/* (non-Javadoc)
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java
deleted file mode 100644
index 3d42b18cb99..00000000000
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java
+++ /dev/null
@@ -1,117 +0,0 @@
-package org.apache.lucene.benchmark.byTask.tasks;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.StringTokenizer;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.benchmark.byTask.PerfRunData;
-
-/**
- * Task to support benchmarking ShingleFilter / ShingleAnalyzerWrapper
- *
- * - NewShingleAnalyzer (constructs with all defaults)
- * - NewShingleAnalyzer(analyzer:o.a.l.analysis.StandardAnalyzer,maxShingleSize:2,outputUnigrams:true)
- */
-public class NewShingleAnalyzerTask extends PerfTask {
-
- private String analyzerClassName = "standard.StandardAnalyzer";
- private int maxShingleSize = 2;
- private boolean outputUnigrams = true;
-
- public NewShingleAnalyzerTask(PerfRunData runData) {
- super(runData);
- }
-
- private void setAnalyzer() throws Exception {
- Analyzer wrappedAnalyzer = null;
- if (null == analyzerClassName || 0 == analyzerClassName.length()) {
- analyzerClassName = "org.apache.lucene.analysis.standard.StandardAnalyzer";
- }
- if (-1 == analyzerClassName.indexOf(".")) {
- String coreClassName = "org.apache.lucene.analysis.core." + analyzerClassName;
- try {
- // If there is no package, first attempt to instantiate a core analyzer
- wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(coreClassName);
- analyzerClassName = coreClassName;
- } catch (ClassNotFoundException e) {
- // If this is not a core analyzer, try the base analysis package
- analyzerClassName = "org.apache.lucene.analysis." + analyzerClassName;
- wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName);
- }
- } else {
- if (analyzerClassName.startsWith("standard.")) {
- analyzerClassName = "org.apache.lucene.analysis." + analyzerClassName;
- }
- wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName);
- }
-
- ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
- wrappedAnalyzer,
- ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
- maxShingleSize,
- ShingleFilter.TOKEN_SEPARATOR,
- outputUnigrams,
- false);
- getRunData().setAnalyzer(analyzer);
- }
-
- @Override
- public int doLogic() throws Exception {
- try {
- setAnalyzer();
- System.out.println
- ("Changed Analyzer to: ShingleAnalyzerWrapper, wrapping ShingleFilter over "
- + analyzerClassName);
- } catch (Exception e) {
- throw new RuntimeException("Error creating Analyzer", e);
- }
- return 1;
- }
-
- @Override
- public void setParams(String params) {
- super.setParams(params);
- StringTokenizer st = new StringTokenizer(params, ",");
- while (st.hasMoreTokens()) {
- String param = st.nextToken();
- StringTokenizer expr = new StringTokenizer(param, ":");
- String key = expr.nextToken();
- String value = expr.nextToken();
- if (key.equalsIgnoreCase("analyzer")) {
- analyzerClassName = value;
- } else if (key.equalsIgnoreCase("outputUnigrams")) {
- outputUnigrams = Boolean.parseBoolean(value);
- } else if (key.equalsIgnoreCase("maxShingleSize")) {
- maxShingleSize = (int)Double.parseDouble(value);
- } else {
- throw new RuntimeException("Unknown parameter " + param);
- }
- }
- }
-
- @Override
- public boolean supportsParams() {
- return true;
- }
-}
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
index 4af1d4df06d..d7a39ffd01b 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
@@ -62,6 +62,9 @@ public abstract class PerfTask implements Cloneable {
private boolean runInBackground;
private int deltaPri;
+ // The first line of this task's definition in the alg file
+ private int algLineNum = 0;
+
protected static final String NEW_LINE = System.getProperty("line.separator");
/** Should not be used externally */
@@ -317,4 +320,11 @@ public abstract class PerfTask implements Cloneable {
this.disableCounting = disableCounting;
}
+ public void setAlgLineNum(int algLineNum) {
+ this.algLineNum = algLineNum;
+ }
+
+ public int getAlgLineNum() {
+ return algLineNum;
+ }
}
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
index fec744e10ca..ef9092c4561 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
@@ -58,11 +58,12 @@ public class Algorithm {
StreamTokenizer stok = new StreamTokenizer(new StringReader(algTxt));
stok.commentChar('#');
stok.eolIsSignificant(false);
- stok.ordinaryChar('"');
+ stok.quoteChar('"');
+ stok.quoteChar('\'');
stok.ordinaryChar('/');
stok.ordinaryChar('(');
stok.ordinaryChar(')');
- boolean colonOk = false;
+ boolean colonOk = false;
boolean isDisableCountNextTask = false; // only for primitive tasks
currSequence.setDepth(0);
@@ -74,6 +75,7 @@ public class Algorithm {
Constructor extends PerfTask> cnstr = taskClass(config,s)
.asSubclass(PerfTask.class).getConstructor(PerfRunData.class);
PerfTask task = cnstr.newInstance(runData);
+ task.setAlgLineNum(stok.lineno());
task.setDisableCounting(isDisableCountNextTask);
isDisableCountNextTask = false;
currSequence.addTask(task);
@@ -90,24 +92,54 @@ public class Algorithm {
if (stok.ttype!='(') {
stok.pushBack();
} else {
- // get params, for tasks that supports them, - anything until next ')'
+ // get params, for tasks that support them - allow recursive parenthetical expressions
+ stok.eolIsSignificant(true); // Allow params tokenizer to keep track of line number
StringBuilder params = new StringBuilder();
stok.nextToken();
- while (stok.ttype!=')') {
- switch (stok.ttype) {
- case StreamTokenizer.TT_NUMBER:
- params.append(stok.nval);
- break;
- case StreamTokenizer.TT_WORD:
- params.append(stok.sval);
- break;
- case StreamTokenizer.TT_EOF:
- throw new Exception("unexpexted EOF: - "+stok.toString());
- default:
- params.append((char)stok.ttype);
+ if (stok.ttype != ')') {
+ int count = 1;
+ BALANCED_PARENS: while (true) {
+ switch (stok.ttype) {
+ case StreamTokenizer.TT_NUMBER: {
+ params.append(stok.nval);
+ break;
+ }
+ case StreamTokenizer.TT_WORD: {
+ params.append(stok.sval);
+ break;
+ }
+ case StreamTokenizer.TT_EOF: {
+ throw new RuntimeException("Unexpexted EOF: - "+stok.toString());
+ }
+ case '"':
+ case '\'': {
+ params.append((char)stok.ttype);
+ // re-escape delimiters, if any
+ params.append(stok.sval.replaceAll("" + (char)stok.ttype, "\\\\" + (char)stok.ttype));
+ params.append((char)stok.ttype);
+ break;
+ }
+ case '(': {
+ params.append((char)stok.ttype);
+ ++count;
+ break;
+ }
+ case ')': {
+ if (--count >= 1) { // exclude final closing parenthesis
+ params.append((char)stok.ttype);
+ } else {
+ break BALANCED_PARENS;
+ }
+ break;
+ }
+ default: {
+ params.append((char)stok.ttype);
+ }
+ }
+ stok.nextToken();
}
- stok.nextToken();
}
+ stok.eolIsSignificant(false);
String prm = params.toString().trim();
if (prm.length()>0) {
task.setParams(prm);
@@ -182,10 +214,8 @@ public class Algorithm {
if (stok.ttype!='"') {
stok.pushBack();
} else {
- stok.nextToken();
name = stok.sval;
- stok.nextToken();
- if (stok.ttype!='"' || name==null || name.length()==0) {
+ if (stok.ttype!='"' || name==null || name.length()==0) {
throw new Exception("sequence name problem - "+stok.toString());
}
}
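The practical effect of this change: a task's parameter list may now nest parentheses and carry quoted strings (commas included), so a line like the following, in the same .alg syntax as shingle.alg above, is captured as one params string (the factory name is illustrative):

    -AnalyzerFactory(name:'bigrams, no unigrams',
                     StandardTokenizer,
                     ShingleFilter(maxShingleSize:2, outputUnigrams:false))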
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/AnalyzerFactory.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/AnalyzerFactory.java
new file mode 100644
index 00000000000..da1de943106
--- /dev/null
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/AnalyzerFactory.java
@@ -0,0 +1,132 @@
+package org.apache.lucene.benchmark.byTask.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+
+import java.io.Reader;
+import java.util.List;
+
+/**
+ * A factory to create an analyzer.
+ * See {@link org.apache.lucene.benchmark.byTask.tasks.AnalyzerFactoryTask}
+ */
+public final class AnalyzerFactory {
+ final private List<CharFilterFactory> charFilterFactories;
+ final private TokenizerFactory tokenizerFactory;
+ final private List<TokenFilterFactory> tokenFilterFactories;
+ private String name = null;
+ private Integer positionIncrementGap = null;
+ private Integer offsetGap = null;
+
+ public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
+                        TokenizerFactory tokenizerFactory,
+                        List<TokenFilterFactory> tokenFilterFactories) {
+ this.charFilterFactories = charFilterFactories;
+ assert null != tokenizerFactory;
+ this.tokenizerFactory = tokenizerFactory;
+ this.tokenFilterFactories = tokenFilterFactories;
+ }
+
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ public void setPositionIncrementGap(Integer positionIncrementGap) {
+ this.positionIncrementGap = positionIncrementGap;
+ }
+
+ public void setOffsetGap(Integer offsetGap) {
+ this.offsetGap = offsetGap;
+ }
+
+ public Analyzer create() {
+ return new Analyzer() {
+ private final Integer positionIncrementGap = AnalyzerFactory.this.positionIncrementGap;
+ private final Integer offsetGap = AnalyzerFactory.this.offsetGap;
+
+ @Override
+ public Reader initReader(String fieldName, Reader reader) {
+ if (charFilterFactories != null && charFilterFactories.size() > 0) {
+ Reader wrappedReader = reader;
+ for (CharFilterFactory charFilterFactory : charFilterFactories) {
+ wrappedReader = charFilterFactory.create(wrappedReader);
+ }
+ reader = wrappedReader;
+ }
+ return reader;
+ }
+
+ @Override
+ protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ final Tokenizer tokenizer = tokenizerFactory.create(reader);
+ TokenStream tokenStream = tokenizer;
+ for (TokenFilterFactory filterFactory : tokenFilterFactories) {
+ tokenStream = filterFactory.create(tokenStream);
+ }
+ return new TokenStreamComponents(tokenizer, tokenStream);
+ }
+
+ @Override
+ public int getPositionIncrementGap(String fieldName) {
+ return null == positionIncrementGap ? super.getPositionIncrementGap(fieldName) : positionIncrementGap;
+ }
+
+ @Override
+ public int getOffsetGap(String fieldName) {
+ return null == offsetGap ? super.getOffsetGap(fieldName) : offsetGap;
+ }
+ };
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder("AnalyzerFactory(");
+ if (null != name) {
+ sb.append("name:");
+ sb.append(name);
+ sb.append(", ");
+ }
+ if (null != positionIncrementGap) {
+ sb.append("positionIncrementGap:");
+ sb.append(positionIncrementGap);
+ sb.append(", ");
+ }
+ if (null != offsetGap) {
+ sb.append("offsetGap:");
+ sb.append(offsetGap);
+ sb.append(", ");
+ }
+ for (CharFilterFactory charFilterFactory: charFilterFactories) {
+ sb.append(charFilterFactory);
+ sb.append(", ");
+ }
+ sb.append(tokenizerFactory);
+ for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
+ sb.append(", ");
+ sb.append(tokenFilterFactory);
+ }
+ sb.append(')');
+ return sb.toString();
+ }
+}
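A sketch of how code might consume one of these factories at runtime, assuming a PerfRunData whose algorithm has already run an AnalyzerFactory task named "shingle-bigrams" (the factory name, demo class, and helper method are hypothetical; the Lucene calls follow the 4.x TokenStream contract):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.benchmark.byTask.PerfRunData;
    import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;

    public class AnalyzerFactoryDemo {
      /** Prints the tokens the named factory's analyzer produces for the given text. */
      public static void dumpTokens(PerfRunData runData, String text) throws Exception {
        AnalyzerFactory factory = runData.getAnalyzerFactories().get("shingle-bigrams");
        Analyzer analyzer = factory.create();
        TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                       // required before the first incrementToken()
        while (ts.incrementToken()) {
          System.out.println(term.toString());
        }
        ts.end();
        ts.close();
      }
    }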
diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
index 0cc49e32033..1a201b69698 100755
--- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
+++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
@@ -71,6 +71,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
public void setUp() throws Exception {
super.setUp();
copyToWorkDir("reuters.first20.lines.txt");
+ copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt");
}
/**
@@ -1020,63 +1021,79 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
}
/**
- * Test that we can create ShingleAnalyzerWrappers.
+ * Test that we can create shingle analyzers using AnalyzerFactory.
*/
public void testShingleAnalyzer() throws Exception {
String text = "one,two,three, four five six";
- // Default analyzer, maxShingleSize, and outputUnigrams
- Benchmark benchmark = execBenchmark(getShingleConfig(""));
+ // StandardTokenizer, maxShingleSize, and outputUnigrams
+ Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
+ ("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
benchmark.getRunData().getAnalyzer().tokenStream
("bogus", new StringReader(text)).close();
- assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
- new String[] {"one", "one two", "two", "two three",
- "three", "three four", "four", "four five",
- "five", "five six", "six"});
- // Default analyzer, maxShingleSize = 3, and outputUnigrams = false
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "one", "one two", "two", "two three",
+ "three", "three four", "four", "four five",
+ "five", "five six", "six" });
+ // StandardTokenizer, maxShingleSize = 3, and outputUnigrams = false
benchmark = execBenchmark
- (getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
- assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
- new String[] { "one two", "one two three", "two three",
- "two three four", "three four",
- "three four five", "four five",
- "four five six", "five six" });
- // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
+ (getAnalyzerFactoryConfig
+ ("shingle-analyzer",
+ "StandardTokenizer,ShingleFilter(maxShingleSize:3,outputUnigrams:false)"));
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "one two", "one two three", "two three",
+ "two three four", "three four",
+ "three four five", "four five",
+ "four five six", "five six" });
+ // WhitespaceTokenizer, default maxShingleSize and outputUnigrams
benchmark = execBenchmark
- (getShingleConfig("analyzer:WhitespaceAnalyzer"));
- assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
- new String[] { "one,two,three,", "one,two,three, four",
- "four", "four five", "five", "five six",
- "six" });
+ (getAnalyzerFactoryConfig("shingle-analyzer", "WhitespaceTokenizer,ShingleFilter"));
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "one,two,three,", "one,two,three, four",
+ "four", "four five", "five", "five six",
+ "six" });
- // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
+ // WhitespaceTokenizer, maxShingleSize=3 and outputUnigrams=false
benchmark = execBenchmark
- (getShingleConfig
- ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
- assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
- new String[] { "one,two,three, four",
- "one,two,three, four five",
- "four five", "four five six",
- "five six" });
+ (getAnalyzerFactoryConfig
+ ("shingle-factory",
+ "WhitespaceTokenizer,ShingleFilter(outputUnigrams:false,maxShingleSize:3)"));
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "one,two,three, four",
+ "one,two,three, four five",
+ "four five", "four five six",
+ "five six" });
}
- private void assertEqualShingle
- (Analyzer analyzer, String text, String[] expected) throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, text, expected);
- }
-
- private String[] getShingleConfig(String params) {
+ private String[] getAnalyzerFactoryConfig(String name, String params) {
+ final String singleQuoteEscapedName = name.replaceAll("'", "\\\\'");
String algLines[] = {
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
+ "work.dir=" + getWorkDir().getAbsolutePath(),
"content.source.forever=false",
"directory=RAMDirectory",
- "NewShingleAnalyzer(" + params + ")",
+ "AnalyzerFactory(name:'" + singleQuoteEscapedName + "', " + params + ")",
+ "NewAnalyzer('" + singleQuoteEscapedName + "')",
"CreateIndex",
"{ \"AddDocs\" AddDoc > : * "
};
return algLines;
}
+
+ public void testAnalyzerFactory() throws Exception {
+ String text = "Fortieth, Quarantième, Cuadragésimo";
+ Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
+ ("ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
+ "positionIncrementGap:100,offsetGap:1111,"
+ +"MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
+ +"PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
+ +"StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "fo", "or", "rt", "ti", "ie", "et", "th",
+ "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix", "xx", "xx", "xe",
+ "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs", "si", "io"});
+ }
private String getReuters20LinesFile() {
return getWorkDirResourcePath("reuters.first20.lines.txt");
diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/test-mapping-ISOLatin1Accent-partial.txt b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/test-mapping-ISOLatin1Accent-partial.txt
new file mode 100644
index 00000000000..0ff17dbbe3f
--- /dev/null
+++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/test-mapping-ISOLatin1Accent-partial.txt
@@ -0,0 +1,30 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Syntax:
+# "source" => "target"
+# "source".length() > 0 (source cannot be empty.)
+# "target".length() >= 0 (target can be empty.)
+
+# example:
+# "À" => "A"
+# "\u00C0" => "A"
+# "\u00C0" => "\u0041"
+# "ß" => "ss"
+# "\t" => " "
+# "\n" => ""
+
+# è => e
+"\u00E8" => "e"
+
+# é => e
+"\u00E9" => "e"