mirror of https://github.com/apache/lucene.git
LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable analyzer creation via the resulting factories using NewAnalyzerTask.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1439510 13f79535-47bb-0310-9956-ffa450edef68
parent ce7be4dc65
commit 14ea836c0b
@@ -79,6 +79,9 @@ New Features
   near-real-time reader is opened that contains those changes.
   (Robert Muir, Mike McCandless)
 
+* LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable analyzer
+  creation via the resulting factories using NewAnalyzerTask. (Steve Rowe)
+
 API Changes
 
 * LUCENE-4709: FacetResultNode no longer has a residue field. (Shai Erera)

@@ -19,25 +19,43 @@ doc.body.tokenized=true
 docs.dir=reuters-out
 log.step=1000
 
+-AnalyzerFactory(name:shingle-bigrams-unigrams,
+                 StandardTokenizer,
+                 ShingleFilter(maxShingleSize:2, outputUnigrams:true))
+
+-AnalyzerFactory(name:shingle-bigrams,
+                 StandardTokenizer,
+                 ShingleFilter(maxShingleSize:2, outputUnigrams:false))
+
+-AnalyzerFactory(name:shingle-4grams-unigrams,
+                 StandardTokenizer,
+                 ShingleFilter(maxShingleSize:4, outputUnigrams:true))
+
+-AnalyzerFactory(name:shingle-4grams,
+                 StandardTokenizer,
+                 ShingleFilter(maxShingleSize:4, outputUnigrams:false))
+
+-AnalyzerFactory(name:standard-tokenizer-only, StandardTokenizer)
+
 { "Rounds"
 
-  -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:true)
+  -NewAnalyzer(shingle-bigrams-unigrams)
   -ResetInputs
   { "BigramsAndUnigrams" { ReadTokens > : 10000 }
 
-  -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:false)
+  -NewAnalyzer(shingle-bigrams)
   -ResetInputs
   { "BigramsOnly" { ReadTokens > : 10000 }
 
-  -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:true)
+  -NewAnalyzer(shingle-4grams-unigrams)
   -ResetInputs
   { "FourgramsAndUnigrams" { ReadTokens > : 10000 }
 
-  -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:false)
+  -NewAnalyzer(shingle-4grams)
   -ResetInputs
   { "FourgramsOnly" { ReadTokens > : 10000 }
 
-  -NewAnalyzer(standard.StandardAnalyzer)
+  -NewAnalyzer(standard-tokenizer-only)
   -ResetInputs
   { "UnigramsOnly" { ReadTokens > : 10000 }
 
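In the rewritten algorithm file above, each parameterization that was previously baked into a NewShingleAnalyzer call becomes a named AnalyzerFactory defined once up front, and the per-round NewAnalyzer calls then select analyzers by those names.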
@@ -51,7 +51,7 @@ while (<>) {
 
 # Print out platform info
 print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
-if ($^O =~ /win/i) {
+if ($^O =~ /(?<!dar)win/i) {
   print "$^O\n";
   eval {
     require Win32;
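A note on the Perl change above: on OS X, Perl's $^O is "darwin", which the old case-insensitive /win/i pattern matched, sending the script down the Windows branch; the added lookbehind (?<!dar) excludes "darwin" while still matching Windows values such as "MSWin32".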
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Locale;
+import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.benchmark.byTask.feeds.ContentSource;

@@ -34,6 +35,7 @@ import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask;
 import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
 import org.apache.lucene.benchmark.byTask.tasks.ReadTask;
 import org.apache.lucene.benchmark.byTask.tasks.SearchTask;
+import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.FileUtils;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;

@@ -55,6 +57,7 @@ import org.apache.lucene.util.IOUtils;
  * <li>Directory, Writer, Reader.
  * <li>Taxonomy Directory, Writer, Reader.
  * <li>DocMaker, FacetSource and a few instances of QueryMaker.
+ * <li>Named AnalysisFactories.
  * <li>Analyzer.
  * <li>Statistics data which updated during the run.
  * </ul>

@@ -78,6 +81,7 @@ public class PerfRunData implements Closeable {
   // directory, analyzer, docMaker - created at startup.
   // reader, writer, searcher - maintained by basic tasks.
   private Directory directory;
+  private Map<String,AnalyzerFactory> analyzerFactories = new HashMap<String,AnalyzerFactory>();
   private Analyzer analyzer;
   private DocMaker docMaker;
   private ContentSource contentSource;

@@ -358,7 +362,7 @@ public class PerfRunData implements Closeable {
   }
 
   /**
-   * @return Returns the anlyzer.
+   * @return Returns the analyzer.
    */
   public Analyzer getAnalyzer() {
     return analyzer;

@@ -434,4 +438,7 @@ public class PerfRunData implements Closeable {
     return qm;
   }
 
+  public Map<String,AnalyzerFactory> getAnalyzerFactories() {
+    return analyzerFactories;
+  }
 }
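To see the flow through the new registry in one place, here is a brief sketch (illustrative only; the individual calls all appear verbatim in this commit's hunks, and the factory name is taken from shingle.alg above). Inside a task, AnalyzerFactoryTask registers a factory under its name, and NewAnalyzerTask later resolves that name and installs the created Analyzer:

    // Registration (AnalyzerFactoryTask.setParams):
    getRunData().getAnalyzerFactories().put("shingle-bigrams", analyzerFactory);
    // Resolution (NewAnalyzerTask.doLogic):
    AnalyzerFactory factory = getRunData().getAnalyzerFactories().get("shingle-bigrams");
    if (null != factory) {
      getRunData().setAnalyzer(factory.create());
    }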
@@ -0,0 +1,459 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.FilesystemResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;
+import org.apache.lucene.util.Version;
+
+import java.io.File;
+import java.io.StreamTokenizer;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * Analyzer factory construction task.  The name given to the constructed factory may
+ * be given to NewAnalyzerTask, which will call AnalyzerFactory.create().
+ *
+ * Params are in the form argname:argvalue or argname:"argvalue" or argname:'argvalue';
+ * use backslashes to escape '"' or "'" inside a quoted value when it's used as the enclosing
+ * quotation mark.
+ *
+ * Specify params in a comma separated list of the following, in order:
+ * <ol>
+ *   <li>Analyzer args:
+ *     <ul>
+ *       <li><b>Required</b>: <code>name:<i>analyzer-factory-name</i></code></li>
+ *       <li>Optional: <tt>positionIncrementGap:<i>int value</i></tt> (default: 0)</li>
+ *       <li>Optional: <tt>offsetGap:<i>int value</i></tt> (default: 1)</li>
+ *     </ul>
+ *   </li>
+ *   <li>zero or more CharFilterFactory's, followed by</li>
+ *   <li>exactly one TokenizerFactory, followed by</li>
+ *   <li>zero or more TokenFilterFactory's</li>
+ * </ol>
+ *
+ * Each component analysis factory may specify <tt>luceneMatchVersion</tt> (defaults to
+ * {@link Version#LUCENE_CURRENT}) and any of the args understood by the specified
+ * *Factory class, in the above-described param format.
+ * <p/>
+ * Example:
+ * <pre>
+ *     -AnalyzerFactory(name:'strip html, fold to ascii, whitespace tokenize, max 10k tokens',
+ *                      positionIncrementGap:100,
+ *                      HTMLStripCharFilter,
+ *                      MappingCharFilter(mapping:'mapping-FoldToASCII.txt'),
+ *                      WhitespaceTokenizer(luceneMatchVersion:LUCENE_42),
+ *                      TokenLimitFilter(maxTokenCount:10000, consumeAllTokens:false))
+ *     [...]
+ *     -NewAnalyzer('strip html, fold to ascii, whitespace tokenize, max 10k tokens')
+ * </pre>
+ * <p/>
+ * AnalyzerFactory will direct analysis component factories to look for resources
+ * under the directory specified in the "work.dir" property.
+ */
+public class AnalyzerFactoryTask extends PerfTask {
+  private static final String LUCENE_ANALYSIS_PACKAGE_PREFIX = "org.apache.lucene.analysis.";
+  private static final Pattern ANALYSIS_COMPONENT_SUFFIX_PATTERN
+      = Pattern.compile("(?s:(?:(?:Token|Char)?Filter|Tokenizer)(?:Factory)?)$");
+  private static final Pattern TRAILING_DOT_ZERO_PATTERN = Pattern.compile("\\.0$");
+
+  private enum ArgType { ANALYZER_ARG, ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER, TOKENFILTER }
+
+  String factoryName = null;
+  Integer positionIncrementGap = null;
+  Integer offsetGap = null;
+  private List<CharFilterFactory> charFilterFactories = new ArrayList<CharFilterFactory>();
+  private TokenizerFactory tokenizerFactory = null;
+  private List<TokenFilterFactory> tokenFilterFactories = new ArrayList<TokenFilterFactory>();
+
+  public AnalyzerFactoryTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  @Override
+  public int doLogic() {
+    return 1;
+  }
+
+  /**
+   * Sets the params.
+   * Analysis component factory names may optionally include the "Factory" suffix.
+   *
+   * @param params analysis pipeline specification: name, (optional) positionIncrementGap,
+   *               (optional) offsetGap, 0+ CharFilterFactory's, 1 TokenizerFactory,
+   *               and 0+ TokenFilterFactory's
+   */
+  @Override
+  public void setParams(String params) {
+    super.setParams(params);
+    ArgType expectedArgType = ArgType.ANALYZER_ARG;
+
+    final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
+    stok.commentChar('#');
+    stok.quoteChar('"');
+    stok.quoteChar('\'');
+    stok.eolIsSignificant(false);
+    stok.ordinaryChar('(');
+    stok.ordinaryChar(')');
+    stok.ordinaryChar(':');
+    stok.ordinaryChar(',');
+    try {
+      while (stok.nextToken() != StreamTokenizer.TT_EOF) {
+        switch (stok.ttype) {
+          case ',': {
+            // Do nothing
+            break;
+          }
+          case StreamTokenizer.TT_WORD: {
+            if (expectedArgType.equals(ArgType.ANALYZER_ARG)) {
+              final String argName = stok.sval;
+              if ( ! argName.equalsIgnoreCase("name")
+                  && ! argName.equalsIgnoreCase("positionIncrementGap")
+                  && ! argName.equalsIgnoreCase("offsetGap")) {
+                throw new RuntimeException
+                    ("Line #" + lineno(stok) + ": Missing 'name' param to AnalyzerFactory: '" + params + "'");
+              }
+              stok.nextToken();
+              if (stok.ttype != ':') {
+                throw new RuntimeException
+                    ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
+              }
+
+              stok.nextToken();
+              String argValue = stok.sval;
+              switch (stok.ttype) {
+                case StreamTokenizer.TT_NUMBER: {
+                  argValue = Double.toString(stok.nval);
+                  // Drop the ".0" from numbers, for integer arguments
+                  argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
+                  // Intentional fallthrough
+                }
+                case '"':
+                case '\'':
+                case StreamTokenizer.TT_WORD: {
+                  if (argName.equalsIgnoreCase("name")) {
+                    factoryName = argValue;
+                    expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER;
+                  } else {
+                    int intArgValue = 0;
+                    try {
+                      intArgValue = Integer.parseInt(argValue);
+                    } catch (NumberFormatException e) {
+                      throw new RuntimeException
+                          ("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + argValue + "'", e);
+                    }
+                    if (argName.equalsIgnoreCase("positionIncrementGap")) {
+                      positionIncrementGap = intArgValue;
+                    } else if (argName.equalsIgnoreCase("offsetGap")) {
+                      offsetGap = intArgValue;
+                    }
+                  }
+                  break;
+                }
+                case StreamTokenizer.TT_EOF: {
+                  throw new RuntimeException("Unexpected EOF: " + stok.toString());
+                }
+                default: {
+                  throw new RuntimeException
+                      ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
+                }
+              }
+            } else if (expectedArgType.equals(ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)) {
+              final String argName = stok.sval;
+
+              if (argName.equalsIgnoreCase("positionIncrementGap")
+                  || argName.equalsIgnoreCase("offsetGap")) {
+                stok.nextToken();
+                if (stok.ttype != ':') {
+                  throw new RuntimeException
+                      ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
+                }
+                stok.nextToken();
+                int intArgValue = (int)stok.nval;
+                switch (stok.ttype) {
+                  case '"':
+                  case '\'':
+                  case StreamTokenizer.TT_WORD: {
+                    intArgValue = 0;
+                    try {
+                      intArgValue = Integer.parseInt(stok.sval.trim());
+                    } catch (NumberFormatException e) {
+                      throw new RuntimeException
+                          ("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + stok.sval + "'", e);
+                    }
+                    // Intentional fall-through
+                  }
+                  case StreamTokenizer.TT_NUMBER: {
+                    if (argName.equalsIgnoreCase("positionIncrementGap")) {
+                      positionIncrementGap = intArgValue;
+                    } else if (argName.equalsIgnoreCase("offsetGap")) {
+                      offsetGap = intArgValue;
+                    }
+                    break;
+                  }
+                  case StreamTokenizer.TT_EOF: {
+                    throw new RuntimeException("Unexpected EOF: " + stok.toString());
+                  }
+                  default: {
+                    throw new RuntimeException
+                        ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
+                  }
+                }
+                break;
+              }
+              try {
+                final Class<? extends CharFilterFactory> clazz;
+                clazz = lookupAnalysisClass(argName, CharFilterFactory.class);
+                createAnalysisPipelineComponent(stok, clazz);
+              } catch (IllegalArgumentException e) {
+                try {
+                  final Class<? extends TokenizerFactory> clazz;
+                  clazz = lookupAnalysisClass(argName, TokenizerFactory.class);
+                  createAnalysisPipelineComponent(stok, clazz);
+                  expectedArgType = ArgType.TOKENFILTER;
+                } catch (IllegalArgumentException e2) {
+                  throw new RuntimeException("Line #" + lineno(stok) + ": Can't find class '"
+                                             + argName + "' as CharFilterFactory or TokenizerFactory");
+                }
+              }
+            } else { // expectedArgType = ArgType.TOKENFILTER
+              final String className = stok.sval;
+              final Class<? extends TokenFilterFactory> clazz;
+              try {
+                clazz = lookupAnalysisClass(className, TokenFilterFactory.class);
+              } catch (IllegalArgumentException e) {
+                throw new RuntimeException
+                    ("Line #" + lineno(stok) + ": Can't find class '" + className + "' as TokenFilterFactory");
+              }
+              createAnalysisPipelineComponent(stok, clazz);
+            }
+            break;
+          }
+          default: {
+            throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
+          }
+        }
+      }
+    } catch (RuntimeException e) {
+      if (e.getMessage().startsWith("Line #")) {
+        throw e;
+      } else {
+        throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
+      }
+    } catch (Throwable t) {
+      throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
+    }
+
+    final AnalyzerFactory analyzerFactory = new AnalyzerFactory
+        (charFilterFactories, tokenizerFactory, tokenFilterFactories);
+    analyzerFactory.setPositionIncrementGap(positionIncrementGap);
+    analyzerFactory.setOffsetGap(offsetGap);
+    getRunData().getAnalyzerFactories().put(factoryName, analyzerFactory);
+  }
+
+  /**
+   * Instantiates the given analysis factory class after pulling params from
+   * the given stream tokenizer, then stores the result in the appropriate
+   * pipeline component list.
+   *
+   * @param stok stream tokenizer from which to draw analysis factory params
+   * @param clazz analysis factory class to instantiate
+   */
+  private void createAnalysisPipelineComponent
+      (StreamTokenizer stok, Class<? extends AbstractAnalysisFactory> clazz) {
+    final AbstractAnalysisFactory instance;
+    try {
+      instance = clazz.newInstance();
+    } catch (Exception e) {
+      throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
+    }
+    Version luceneMatchVersion = null;
+    Map<String,String> argMap = new HashMap<String,String>();
+    boolean parenthetical = false;
+    try {
+      WHILE_LOOP: while (stok.nextToken() != StreamTokenizer.TT_EOF) {
+        switch (stok.ttype) {
+          case ',': {
+            if (parenthetical) {
+              // Do nothing
+              break;
+            } else {
+              // Finished reading this analysis factory configuration
+              break WHILE_LOOP;
+            }
+          }
+          case '(': {
+            if (parenthetical) {
+              throw new RuntimeException
+                  ("Line #" + lineno(stok) + ": Unexpected opening parenthesis.");
+            }
+            parenthetical = true;
+            break;
+          }
+          case ')': {
+            if (parenthetical) {
+              parenthetical = false;
+            } else {
+              throw new RuntimeException
+                  ("Line #" + lineno(stok) + ": Unexpected closing parenthesis.");
+            }
+            break;
+          }
+          case StreamTokenizer.TT_WORD: {
+            if ( ! parenthetical) {
+              throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token '" + stok.sval + "'");
+            }
+            String argName = stok.sval;
+            stok.nextToken();
+            if (stok.ttype != ':') {
+              throw new RuntimeException
+                  ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to " + clazz.getSimpleName());
+            }
+            stok.nextToken();
+            String argValue = stok.sval;
+            switch (stok.ttype) {
+              case StreamTokenizer.TT_NUMBER: {
+                argValue = Double.toString(stok.nval);
+                // Drop the ".0" from numbers, for integer arguments
+                argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
+                // Intentional fall-through
+              }
+              case '"':
+              case '\'':
+              case StreamTokenizer.TT_WORD: {
+                if (argName.equalsIgnoreCase("luceneMatchVersion")) {
+                  try {
+                    luceneMatchVersion = Version.parseLeniently(argValue);
+                  } catch (IllegalArgumentException e) {
+                    throw new RuntimeException
+                        ("Line #" + lineno(stok) + ": Unrecognized luceneMatchVersion '" + argValue + "'", e);
+                  }
+                } else {
+                  argMap.put(argName, argValue);
+                }
+                break;
+              }
+              case StreamTokenizer.TT_EOF: {
+                throw new RuntimeException("Unexpected EOF: " + stok.toString());
+              }
+              default: {
+                throw new RuntimeException
+                    ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
+              }
+            }
+          }
+        }
+      }
+
+      instance.setLuceneMatchVersion
+          (null == luceneMatchVersion ? Version.LUCENE_CURRENT : luceneMatchVersion);
+      instance.init(argMap);
+      if (instance instanceof ResourceLoaderAware) {
+        File baseDir = new File(getRunData().getConfig().get("work.dir", "work")).getAbsoluteFile();
+        ((ResourceLoaderAware)instance).inform(new FilesystemResourceLoader(baseDir));
+      }
+      if (CharFilterFactory.class.isAssignableFrom(clazz)) {
+        charFilterFactories.add((CharFilterFactory)instance);
+      } else if (TokenizerFactory.class.isAssignableFrom(clazz)) {
+        tokenizerFactory = (TokenizerFactory)instance;
+      } else if (TokenFilterFactory.class.isAssignableFrom(clazz)) {
+        tokenFilterFactories.add((TokenFilterFactory)instance);
+      }
+    } catch (RuntimeException e) {
+      if (e.getMessage().startsWith("Line #")) {
+        throw (e);
+      } else {
+        throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
+      }
+    } catch (Throwable t) {
+      throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
+    }
+  }
+
+  /**
+   * This method looks up a class with its fully qualified name (FQN), or a short-name
+   * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis."
+   * as the package prefix (e.g. "standard.ClassicTokenizerFactory" ->
+   * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
+   *
+   * If className contains a period, the class is first looked up as-is, assuming that it
+   * is an FQN.  If this fails, lookup is retried after prepending the Lucene analysis
+   * package prefix to the class name.
+   *
+   * If className does not contain a period, the analysis SPI *Factory.lookupClass()
+   * methods are used to find the class.
+   *
+   * @param className The name or the short name of the class.
+   * @param expectedType The superclass className is expected to extend
+   * @return the loaded class.
+   * @throws ClassNotFoundException if lookup fails
+   */
+  public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
+      throws ClassNotFoundException {
+    if (className.contains(".")) {
+      try {
+        // First, try className == FQN
+        return Class.forName(className).asSubclass(expectedType);
+      } catch (ClassNotFoundException e) {
+        try {
+          // Second, retry lookup after prepending the Lucene analysis package prefix
+          return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
+        } catch (ClassNotFoundException e1) {
+          throw new ClassNotFoundException("Can't find class '" + className
+                                           + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
+        }
+      }
+    }
+    // No dot - use analysis SPI lookup
+    final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
+    if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
+      return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
+    } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
+      return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
+    } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
+      return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
+    }
+
+    throw new ClassNotFoundException("Can't find class '" + className + "'");
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
+   */
+  @Override
+  public boolean supportsParams() {
+    return true;
+  }
+
+  /** Returns the current line in the algorithm file */
+  public int lineno(StreamTokenizer stok) {
+    return getAlgLineNum() + stok.lineno();
+  }
+}
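For reference, the three resolution strategies in lookupAnalysisClass() above, illustrated with a sketch (the component names are only examples, not taken from the commit):

    // 1. Contains a dot: tried as a fully qualified name first...
    lookupAnalysisClass("org.apache.lucene.analysis.shingle.ShingleFilterFactory", TokenFilterFactory.class);
    // 2. ...and retried with the "org.apache.lucene.analysis." prefix prepended:
    lookupAnalysisClass("shingle.ShingleFilterFactory", TokenFilterFactory.class);
    // 3. No dot: the (Token|Char)Filter/Tokenizer(Factory) suffix is stripped and the
    //    analysis SPI is consulted, so "ShingleFilter" is found via TokenFilterFactory.lookupClass("Shingle").
    lookupAnalysisClass("ShingleFilter", TokenFilterFactory.class);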
@@ -16,10 +16,16 @@ package org.apache.lucene.benchmark.byTask.tasks;
  */
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;
 import org.apache.lucene.util.Version;
 
 import java.io.IOException;
+import java.io.StreamTokenizer;
+import java.io.StringReader;
 import java.util.*;
 import java.lang.reflect.Constructor;
 

@@ -28,12 +34,12 @@ import java.lang.reflect.Constructor;
  *
  */
 public class NewAnalyzerTask extends PerfTask {
-  private List<String> analyzerClassNames;
+  private List<String> analyzerNames;
   private int current;
 
   public NewAnalyzerTask(PerfRunData runData) {
     super(runData);
-    analyzerClassNames = new ArrayList<String>();
+    analyzerNames = new ArrayList<String>();
   }
 
   public static final Analyzer createAnalyzer(String className) throws Exception{

@@ -50,55 +56,98 @@ public class NewAnalyzerTask extends PerfTask {
 
   @Override
   public int doLogic() throws IOException {
-    String className = null;
+    String analyzerName = null;
     try {
-      if (current >= analyzerClassNames.size()) {
+      if (current >= analyzerNames.size()) {
         current = 0;
       }
-      className = analyzerClassNames.get(current++);
+      analyzerName = analyzerNames.get(current++);
       Analyzer analyzer = null;
-      if (null == className || 0 == className.length()) {
-        className = "org.apache.lucene.analysis.standard.StandardAnalyzer";
+      if (null == analyzerName || 0 == analyzerName.length()) {
+        analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer";
       }
-      if (-1 == className.indexOf(".")) {
-        try {
-          // If no package, first attempt to instantiate a core analyzer
-          String coreClassName = "org.apache.lucene.analysis.core." + className;
-          analyzer = createAnalyzer(coreClassName);
-          className = coreClassName;
-        } catch (ClassNotFoundException e) {
-          // If not a core analyzer, try the base analysis package
-          className = "org.apache.lucene.analysis." + className;
-          analyzer = createAnalyzer(className);
-        }
+      // First, lookup analyzerName as a named analyzer factory
+      AnalyzerFactory factory = getRunData().getAnalyzerFactories().get(analyzerName);
+      if (null != factory) {
+        analyzer = factory.create();
       } else {
-        if (className.startsWith("standard.")) {
-          className = "org.apache.lucene.analysis." + className;
+        if (analyzerName.contains(".")) {
+          if (analyzerName.startsWith("standard.")) {
+            analyzerName = "org.apache.lucene.analysis." + analyzerName;
+          }
+          analyzer = createAnalyzer(analyzerName);
+        } else { // No package
+          try {
+            // Attempt to instantiate a core analyzer
+            String coreClassName = "org.apache.lucene.analysis.core." + analyzerName;
+            analyzer = createAnalyzer(coreClassName);
+            analyzerName = coreClassName;
+          } catch (ClassNotFoundException e) {
+            // If not a core analyzer, try the base analysis package
+            analyzerName = "org.apache.lucene.analysis." + analyzerName;
+            analyzer = createAnalyzer(analyzerName);
+          }
         }
-        analyzer = createAnalyzer(className);
       }
       getRunData().setAnalyzer(analyzer);
-      System.out.println("Changed Analyzer to: " + className);
     } catch (Exception e) {
-      throw new RuntimeException("Error creating Analyzer: " + className, e);
+      throw new RuntimeException("Error creating Analyzer: " + analyzerName, e);
     }
     return 1;
   }
 
   /**
-   * Set the params (analyzerClassName only), Comma-separate list of Analyzer class names.  If the Analyzer lives in
+   * Set the params (analyzerName only), Comma-separate list of Analyzer class names.  If the Analyzer lives in
    * org.apache.lucene.analysis, the name can be shortened by dropping the o.a.l.a part of the Fully Qualified Class Name.
    * <p/>
+   * Analyzer names may also refer to previously defined AnalyzerFactory's.
+   * <p/>
    * Example Declaration: {"NewAnalyzer" NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer) >
+   * <p/>
+   * Example AnalyzerFactory usage:
+   * <pre>
+   * -AnalyzerFactory(name:'whitespace tokenized',WhitespaceTokenizer)
+   * -NewAnalyzer('whitespace tokenized')
+   * </pre>
    * @param params analyzerClassName, or empty for the StandardAnalyzer
    */
   @Override
   public void setParams(String params) {
     super.setParams(params);
-    for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) {
-      String s = tokenizer.nextToken();
-      analyzerClassNames.add(s.trim());
+    final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
+    stok.quoteChar('"');
+    stok.quoteChar('\'');
+    stok.eolIsSignificant(false);
+    stok.ordinaryChar(',');
+    try {
+      while (stok.nextToken() != StreamTokenizer.TT_EOF) {
+        switch (stok.ttype) {
+          case ',': {
+            // Do nothing
+            break;
+          }
+          case '\'':
+          case '\"':
+          case StreamTokenizer.TT_WORD: {
+            analyzerNames.add(stok.sval);
+            break;
+          }
+          default: {
+            throw new RuntimeException("Unexpected token: " + stok.toString());
+          }
+        }
+      }
+    } catch (RuntimeException e) {
+      if (e.getMessage().startsWith("Line #")) {
+        throw e;
+      } else {
+        throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", e);
+      }
+    } catch (Throwable t) {
+      throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", t);
     }
   }
 
   /* (non-Javadoc)
@@ -1,117 +0,0 @@
-package org.apache.lucene.benchmark.byTask.tasks;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.StringTokenizer;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.benchmark.byTask.PerfRunData;
-
-/**
- * Task to support benchmarking ShingleFilter / ShingleAnalyzerWrapper
- * <p>
- * <ul>
- *  <li> <code>NewShingleAnalyzer</code> (constructs with all defaults)
- *  <li> <code>NewShingleAnalyzer(analyzer:o.a.l.analysis.StandardAnalyzer,maxShingleSize:2,outputUnigrams:true)</code>
- * </ul>
- * </p>
- */
-public class NewShingleAnalyzerTask extends PerfTask {
-
-  private String analyzerClassName = "standard.StandardAnalyzer";
-  private int maxShingleSize = 2;
-  private boolean outputUnigrams = true;
-
-  public NewShingleAnalyzerTask(PerfRunData runData) {
-    super(runData);
-  }
-
-  private void setAnalyzer() throws Exception {
-    Analyzer wrappedAnalyzer = null;
-    if (null == analyzerClassName || 0 == analyzerClassName.length()) {
-      analyzerClassName = "org.apache.lucene.analysis.standard.StandardAnalyzer";
-    }
-    if (-1 == analyzerClassName.indexOf(".")) {
-      String coreClassName = "org.apache.lucene.analysis.core." + analyzerClassName;
-      try {
-        // If there is no package, first attempt to instantiate a core analyzer
-        wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(coreClassName);
-        analyzerClassName = coreClassName;
-      } catch (ClassNotFoundException e) {
-        // If this is not a core analyzer, try the base analysis package
-        analyzerClassName = "org.apache.lucene.analysis." + analyzerClassName;
-        wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName);
-      }
-    } else {
-      if (analyzerClassName.startsWith("standard.")) {
-        analyzerClassName = "org.apache.lucene.analysis." + analyzerClassName;
-      }
-      wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName);
-    }
-
-    ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
-        wrappedAnalyzer,
-        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
-        maxShingleSize,
-        ShingleFilter.TOKEN_SEPARATOR,
-        outputUnigrams,
-        false);
-    getRunData().setAnalyzer(analyzer);
-  }
-
-  @Override
-  public int doLogic() throws Exception {
-    try {
-      setAnalyzer();
-      System.out.println
-          ("Changed Analyzer to: ShingleAnalyzerWrapper, wrapping ShingleFilter over "
-           + analyzerClassName);
-    } catch (Exception e) {
-      throw new RuntimeException("Error creating Analyzer", e);
-    }
-    return 1;
-  }
-
-  @Override
-  public void setParams(String params) {
-    super.setParams(params);
-    StringTokenizer st = new StringTokenizer(params, ",");
-    while (st.hasMoreTokens()) {
-      String param = st.nextToken();
-      StringTokenizer expr = new StringTokenizer(param, ":");
-      String key = expr.nextToken();
-      String value = expr.nextToken();
-      if (key.equalsIgnoreCase("analyzer")) {
-        analyzerClassName = value;
-      } else if (key.equalsIgnoreCase("outputUnigrams")) {
-        outputUnigrams = Boolean.parseBoolean(value);
-      } else if (key.equalsIgnoreCase("maxShingleSize")) {
-        maxShingleSize = (int)Double.parseDouble(value);
-      } else {
-        throw new RuntimeException("Unknown parameter " + param);
-      }
-    }
-  }
-
-  @Override
-  public boolean supportsParams() {
-    return true;
-  }
-}
@@ -62,6 +62,9 @@ public abstract class PerfTask implements Cloneable {
   private boolean runInBackground;
   private int deltaPri;
 
+  // The first line of this task's definition in the alg file
+  private int algLineNum = 0;
+
   protected static final String NEW_LINE = System.getProperty("line.separator");
 
   /** Should not be used externally */

@@ -317,4 +320,11 @@ public abstract class PerfTask implements Cloneable {
     this.disableCounting = disableCounting;
   }
 
+  public void setAlgLineNum(int algLineNum) {
+    this.algLineNum = algLineNum;
+  }
+
+  public int getAlgLineNum() {
+    return algLineNum;
+  }
 }
@@ -58,11 +58,12 @@ public class Algorithm {
     StreamTokenizer stok = new StreamTokenizer(new StringReader(algTxt));
     stok.commentChar('#');
     stok.eolIsSignificant(false);
-    stok.ordinaryChar('"');
+    stok.quoteChar('"');
+    stok.quoteChar('\'');
     stok.ordinaryChar('/');
     stok.ordinaryChar('(');
     stok.ordinaryChar(')');
     boolean colonOk = false;
     boolean isDisableCountNextTask = false; // only for primitive tasks
     currSequence.setDepth(0);

@@ -74,6 +75,7 @@ public class Algorithm {
         Constructor<? extends PerfTask> cnstr = taskClass(config,s)
             .asSubclass(PerfTask.class).getConstructor(PerfRunData.class);
         PerfTask task = cnstr.newInstance(runData);
+        task.setAlgLineNum(stok.lineno());
         task.setDisableCounting(isDisableCountNextTask);
         isDisableCountNextTask = false;
         currSequence.addTask(task);

@@ -90,24 +92,54 @@ public class Algorithm {
           if (stok.ttype!='(') {
             stok.pushBack();
           } else {
-            // get params, for tasks that supports them, - anything until next ')'
+            // get params, for tasks that supports them - allow recursive parenthetical expressions
+            stok.eolIsSignificant(true);  // Allow params tokenizer to keep track of line number
             StringBuilder params = new StringBuilder();
             stok.nextToken();
-            while (stok.ttype!=')') {
-              switch (stok.ttype) {
-                case StreamTokenizer.TT_NUMBER:
-                  params.append(stok.nval);
-                  break;
-                case StreamTokenizer.TT_WORD:
-                  params.append(stok.sval);
-                  break;
-                case StreamTokenizer.TT_EOF:
-                  throw new Exception("unexpexted EOF: - "+stok.toString());
-                default:
-                  params.append((char)stok.ttype);
+            if (stok.ttype != ')') {
+              int count = 1;
+              BALANCED_PARENS: while (true) {
+                switch (stok.ttype) {
+                  case StreamTokenizer.TT_NUMBER: {
+                    params.append(stok.nval);
+                    break;
+                  }
+                  case StreamTokenizer.TT_WORD: {
+                    params.append(stok.sval);
+                    break;
+                  }
+                  case StreamTokenizer.TT_EOF: {
+                    throw new RuntimeException("Unexpexted EOF: - "+stok.toString());
+                  }
+                  case '"':
+                  case '\'': {
+                    params.append((char)stok.ttype);
+                    // re-escape delimiters, if any
+                    params.append(stok.sval.replaceAll("" + (char)stok.ttype, "\\\\" + (char)stok.ttype));
+                    params.append((char)stok.ttype);
+                    break;
+                  }
+                  case '(': {
+                    params.append((char)stok.ttype);
+                    ++count;
+                    break;
+                  }
+                  case ')': {
+                    if (--count >= 1) { // exclude final closing parenthesis
+                      params.append((char)stok.ttype);
+                    } else {
+                      break BALANCED_PARENS;
+                    }
+                    break;
+                  }
+                  default: {
+                    params.append((char)stok.ttype);
+                  }
+                }
+                stok.nextToken();
               }
-              stok.nextToken();
             }
+            stok.eolIsSignificant(false);
             String prm = params.toString().trim();
             if (prm.length()>0) {
               task.setParams(prm);

@@ -182,10 +214,8 @@ public class Algorithm {
         if (stok.ttype!='"') {
           stok.pushBack();
         } else {
-          stok.nextToken();
           name = stok.sval;
-          stok.nextToken();
           if (stok.ttype!='"' || name==null || name.length()==0) {
             throw new Exception("sequence name problem - "+stok.toString());
           }
         }
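The balanced-parenthesis scanning added above is what allows a task's param list to itself contain parenthesized factory params. Under the old "anything until the next ')'" rule, an algorithm line such as the following (taken from the shingle.alg changes earlier in this commit) would have been cut off at the first closing parenthesis:

    -AnalyzerFactory(name:shingle-bigrams, StandardTokenizer, ShingleFilter(maxShingleSize:2, outputUnigrams:false))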
@@ -0,0 +1,132 @@
+package org.apache.lucene.benchmark.byTask.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+
+import java.io.Reader;
+import java.util.List;
+
+/**
+ * A factory to create an analyzer.
+ * See {@link org.apache.lucene.benchmark.byTask.tasks.AnalyzerFactoryTask}
+ */
+public final class AnalyzerFactory {
+  final private List<CharFilterFactory> charFilterFactories;
+  final private TokenizerFactory tokenizerFactory;
+  final private List<TokenFilterFactory> tokenFilterFactories;
+  private String name = null;
+  private Integer positionIncrementGap = null;
+  private Integer offsetGap = null;
+
+  public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
+                         TokenizerFactory tokenizerFactory,
+                         List<TokenFilterFactory> tokenFilterFactories) {
+    this.charFilterFactories = charFilterFactories;
+    assert null != tokenizerFactory;
+    this.tokenizerFactory = tokenizerFactory;
+    this.tokenFilterFactories = tokenFilterFactories;
+  }
+
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  public void setPositionIncrementGap(Integer positionIncrementGap) {
+    this.positionIncrementGap = positionIncrementGap;
+  }
+
+  public void setOffsetGap(Integer offsetGap) {
+    this.offsetGap = offsetGap;
+  }
+
+  public Analyzer create() {
+    return new Analyzer() {
+      private final Integer positionIncrementGap = AnalyzerFactory.this.positionIncrementGap;
+      private final Integer offsetGap = AnalyzerFactory.this.offsetGap;
+
+      @Override
+      public Reader initReader(String fieldName, Reader reader) {
+        if (charFilterFactories != null && charFilterFactories.size() > 0) {
+          Reader wrappedReader = reader;
+          for (CharFilterFactory charFilterFactory : charFilterFactories) {
+            wrappedReader = charFilterFactory.create(wrappedReader);
+          }
+          reader = wrappedReader;
+        }
+        return reader;
+      }
+
+      @Override
+      protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        final Tokenizer tokenizer = tokenizerFactory.create(reader);
+        TokenStream tokenStream = tokenizer;
+        for (TokenFilterFactory filterFactory : tokenFilterFactories) {
+          tokenStream = filterFactory.create(tokenStream);
+        }
+        return new TokenStreamComponents(tokenizer, tokenStream);
+      }
+
+      @Override
+      public int getPositionIncrementGap(String fieldName) {
+        return null == positionIncrementGap ? super.getPositionIncrementGap(fieldName) : positionIncrementGap;
+      }
+
+      @Override
+      public int getOffsetGap(String fieldName) {
+        return null == offsetGap ? super.getOffsetGap(fieldName) : offsetGap;
+      }
+    };
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder("AnalyzerFactory(");
+    if (null != name) {
+      sb.append("name:");
+      sb.append(name);
+      sb.append(", ");
+    }
+    if (null != positionIncrementGap) {
+      sb.append("positionIncrementGap:");
+      sb.append(positionIncrementGap);
+      sb.append(", ");
+    }
+    if (null != offsetGap) {
+      sb.append("offsetGap:");
+      sb.append(offsetGap);
+      sb.append(", ");
+    }
+    for (CharFilterFactory charFilterFactory: charFilterFactories) {
+      sb.append(charFilterFactory);
+      sb.append(", ");
+    }
+    sb.append(tokenizerFactory);
+    for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
+      sb.append(", ");
+      sb.append(tokenFilterFactory);
+    }
+    sb.append(')');
+    return sb.toString();
+  }
+}
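A minimal usage sketch of the new class (not from the commit; the factory variable, field name, and sample text are assumed). Given an AnalyzerFactory assembled from component factories, as AnalyzerFactoryTask does, create() yields a ready-to-use Analyzer:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import java.io.StringReader;

    Analyzer analyzer = factory.create();
    TokenStream ts = analyzer.tokenStream("body", new StringReader("four five six"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());  // e.g. shingles, if a ShingleFilter was configured
    }
    ts.end();
    ts.close();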
@@ -71,6 +71,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
   public void setUp() throws Exception {
     super.setUp();
     copyToWorkDir("reuters.first20.lines.txt");
+    copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt");
   }
 
   /**

@@ -1020,63 +1021,79 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
   }
 
   /**
-   * Test that we can create ShingleAnalyzerWrappers.
+   * Test that we can create shingle analyzers using AnalyzerFactory.
    */
   public void testShingleAnalyzer() throws Exception {
     String text = "one,two,three, four five six";
 
-    // Default analyzer, maxShingleSize, and outputUnigrams
-    Benchmark benchmark = execBenchmark(getShingleConfig(""));
+    // StandardTokenizer, maxShingleSize, and outputUnigrams
+    Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
+        ("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
     benchmark.getRunData().getAnalyzer().tokenStream
         ("bogus", new StringReader(text)).close();
-    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
-                       new String[] {"one", "one two", "two", "two three",
-                                     "three", "three four", "four", "four five",
-                                     "five", "five six", "six"});
-    // Default analyzer, maxShingleSize = 3, and outputUnigrams = false
+    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+        new String[] { "one", "one two", "two", "two three",
+                       "three", "three four", "four", "four five",
+                       "five", "five six", "six" });
+    // StandardTokenizer, maxShingleSize = 3, and outputUnigrams = false
     benchmark = execBenchmark
-      (getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
-    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
-                       new String[] { "one two", "one two three", "two three",
-                                      "two three four", "three four",
-                                      "three four five", "four five",
-                                      "four five six", "five six" });
-    // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
+      (getAnalyzerFactoryConfig
+          ("shingle-analyzer",
+           "StandardTokenizer,ShingleFilter(maxShingleSize:3,outputUnigrams:false)"));
+    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+        new String[] { "one two", "one two three", "two three",
+                       "two three four", "three four",
+                       "three four five", "four five",
+                       "four five six", "five six" });
+    // WhitespaceTokenizer, default maxShingleSize and outputUnigrams
     benchmark = execBenchmark
-      (getShingleConfig("analyzer:WhitespaceAnalyzer"));
-    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
-                       new String[] { "one,two,three,", "one,two,three, four",
-                                      "four", "four five", "five", "five six",
-                                      "six" });
+      (getAnalyzerFactoryConfig("shingle-analyzer", "WhitespaceTokenizer,ShingleFilter"));
+    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+        new String[] { "one,two,three,", "one,two,three, four",
+                       "four", "four five", "five", "five six",
+                       "six" });
 
-    // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
+    // WhitespaceTokenizer, maxShingleSize=3 and outputUnigrams=false
     benchmark = execBenchmark
-      (getShingleConfig
-          ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
-    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
-                       new String[] { "one,two,three, four",
-                                      "one,two,three, four five",
-                                      "four five", "four five six",
-                                      "five six" });
+      (getAnalyzerFactoryConfig
+          ("shingle-factory",
+           "WhitespaceTokenizer,ShingleFilter(outputUnigrams:false,maxShingleSize:3)"));
+    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+        new String[] { "one,two,three, four",
+                       "one,two,three, four five",
+                       "four five", "four five six",
+                       "five six" });
   }
 
-  private void assertEqualShingle
-      (Analyzer analyzer, String text, String[] expected) throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, text, expected);
-  }
-
-  private String[] getShingleConfig(String params) {
+  private String[] getAnalyzerFactoryConfig(String name, String params) {
+    final String singleQuoteEscapedName = name.replaceAll("'", "\\\\'");
     String algLines[] = {
         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
         "docs.file=" + getReuters20LinesFile(),
+        "work.dir=" + getWorkDir().getAbsolutePath(),
        "content.source.forever=false",
        "directory=RAMDirectory",
-        "NewShingleAnalyzer(" + params + ")",
+        "AnalyzerFactory(name:'" + singleQuoteEscapedName + "', " + params + ")",
+        "NewAnalyzer('" + singleQuoteEscapedName + "')",
        "CreateIndex",
        "{ \"AddDocs\" AddDoc > : * "
     };
     return algLines;
   }
 
+  public void testAnalyzerFactory() throws Exception {
+    String text = "Fortieth, Quarantième, Cuadragésimo";
+    Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
+        ("ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
+         "positionIncrementGap:100,offsetGap:1111,"
+         +"MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
+         +"PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
+         +"StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
+    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+        new String[] { "fo", "or", "rt", "ti", "ie", "et", "th",
+                       "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix", "xx", "xx", "xe",
+                       "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs", "si", "io"});
+  }
+
   private String getReuters20LinesFile() {
     return getWorkDirResourcePath("reuters.first20.lines.txt");
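A note on the escaping in testAnalyzerFactory() above (my reading of the layers, not stated in the commit): the pattern passes through three unescaping steps — the Java string literal ('e(\\\\\\\\S*)m' holds four backslashes at runtime), the algorithm-file StreamTokenizer's quoted-string handling (four become two), and the task's own param tokenizer (two become one) — so the regex that PatternReplaceCharFilterFactory finally compiles is e(\S*)m.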
@@ -0,0 +1,30 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Syntax:
+#   "source" => "target"
+#     "source".length() > 0 (source cannot be empty.)
+#     "target".length() >= 0 (target can be empty.)
+
+# example:
+#   "À" => "A"
+#   "\u00C0" => "A"
+#   "\u00C0" => "\u0041"
+#   "ß" => "ss"
+#   "\t" => " "
+#   "\n" => ""
+
+# è => e
+"\u00E8" => "e"
+
+# é => e
+"\u00E9" => "e"