diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b36d3972c89..0bbf862d887 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -79,6 +79,9 @@ New Features near-real-time reader is opened that contains those changes. (Robert Muir, Mike McCandless) +* LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable analyzer + creation via the resulting factories using NewAnalyzerTask. (Steve Rowe) + API Changes * LUCENE-4709: FacetResultNode no longer has a residue field. (Shai Erera) diff --git a/lucene/benchmark/conf/shingle.alg b/lucene/benchmark/conf/shingle.alg index 5fb68763c38..b0744341c76 100644 --- a/lucene/benchmark/conf/shingle.alg +++ b/lucene/benchmark/conf/shingle.alg @@ -19,25 +19,43 @@ doc.body.tokenized=true docs.dir=reuters-out log.step=1000 +-AnalyzerFactory(name:shingle-bigrams-unigrams, + StandardTokenizer, + ShingleFilter(maxShingleSize:2, outputUnigrams:true)) + +-AnalyzerFactory(name:shingle-bigrams, + StandardTokenizer, + ShingleFilter(maxShingleSize:2, outputUnigrams:false)) + +-AnalyzerFactory(name:shingle-4grams-unigrams, + StandardTokenizer, + ShingleFilter(maxShingleSize:4, outputUnigrams:true)) + +-AnalyzerFactory(name:shingle-4grams, + StandardTokenizer, + ShingleFilter(maxShingleSize:4, outputUnigrams:false)) + +-AnalyzerFactory(name:standard-tokenizer-only, StandardTokenizer) + { "Rounds" - -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:true) + -NewAnalyzer(shingle-bigrams-unigrams) -ResetInputs { "BigramsAndUnigrams" { ReadTokens > : 10000 } - -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:false) + -NewAnalyzer(shingle-bigrams) -ResetInputs { "BigramsOnly" { ReadTokens > : 10000 } - -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:true) + -NewAnalyzer(shingle-4grams-unigrams) -ResetInputs { "FourgramsAndUnigrams" { ReadTokens > : 10000 } - -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:false) + -NewAnalyzer(shingle-4grams) -ResetInputs { "FourgramsOnly" { ReadTokens > : 10000 } - 
-NewAnalyzer(standard.StandardAnalyzer) + -NewAnalyzer(standard-tokenizer-only) -ResetInputs { "UnigramsOnly" { ReadTokens > : 10000 } diff --git a/lucene/benchmark/scripts/shingle.bm2jira.pl b/lucene/benchmark/scripts/shingle.bm2jira.pl index ce6d1936f67..728dc9b8045 100644 --- a/lucene/benchmark/scripts/shingle.bm2jira.pl +++ b/lucene/benchmark/scripts/shingle.bm2jira.pl @@ -51,7 +51,7 @@ while (<>) { # Print out platform info print "JAVA:\n", `java -version 2>&1`, "\nOS:\n"; -if ($^O =~ /win/i) { +if ($^O =~ /(?Directory, Writer, Reader. *
  • Taxonomy Directory, Writer, Reader. *
  • DocMaker, FacetSource and a few instances of QueryMaker. + *
  • Named AnalysisFactories. *
  • Analyzer. *
  • Statistics data which updated during the run. * @@ -78,6 +81,7 @@ public class PerfRunData implements Closeable { // directory, analyzer, docMaker - created at startup. // reader, writer, searcher - maintained by basic tasks. private Directory directory; + private Map analyzerFactories = new HashMap(); private Analyzer analyzer; private DocMaker docMaker; private ContentSource contentSource; @@ -358,7 +362,7 @@ public class PerfRunData implements Closeable { } /** - * @return Returns the anlyzer. + * @return Returns the analyzer. */ public Analyzer getAnalyzer() { return analyzer; @@ -434,4 +438,7 @@ public class PerfRunData implements Closeable { return qm; } + public Map getAnalyzerFactories() { + return analyzerFactories; + } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AnalyzerFactoryTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AnalyzerFactoryTask.java new file mode 100644 index 00000000000..d69fe64a413 --- /dev/null +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AnalyzerFactoryTask.java @@ -0,0 +1,459 @@ +package org.apache.lucene.benchmark.byTask.tasks; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.FilesystemResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory; +import org.apache.lucene.util.Version; + +import java.io.File; +import java.io.StreamTokenizer; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + +/** + * Analyzer factory construction task. The name given to the constructed factory may + * be given to NewAnalyzerTask, which will call AnalyzerFactory.create(). + * + * Params are in the form argname:argvalue or argname:"argvalue" or argname:'argvalue'; + * use backslashes to escape '"' or "'" inside a quoted value when it's used as the enclosing + * quotation mark, + * + * Specify params in a comma separated list of the following, in order: + *
      + *
    1. Analyzer args: + *
        + *
      • Required: name:analyzer-factory-name
      • + *
      • Optional: positionIncrementGap:int value (default: 0)
      • + *
      • Optional: offsetGap:int value (default: 1)
      • + *
      + *
    2. + *
    3. zero or more CharFilterFactory's, followed by
    4. + *
    5. exactly one TokenizerFactory, followed by
    6. + *
    7. zero or more TokenFilterFactory's
    8. + *
    + * + * Each component analysis factory may specify luceneMatchVersion (defaults to + * {@link Version#LUCENE_CURRENT}) and any of the args understood by the specified + * *Factory class, in the above-described param format. + *

    + * Example: + *

    + *     -AnalyzerFactory(name:'strip html, fold to ascii, whitespace tokenize, max 10k tokens',
    + *                      positionIncrementGap:100,
    + *                      HTMLStripCharFilter,
    + *                      MappingCharFilter(mapping:'mapping-FoldToASCII.txt'),
    + *                      WhitespaceTokenizer(luceneMatchVersion:LUCENE_42),
    + *                      TokenLimitFilter(maxTokenCount:10000, consumeAllTokens:false))
    + *     [...]
    + *     -NewAnalyzer('strip html, fold to ascii, whitespace tokenize, max 10k tokens')
    + * 
    + *

    + * AnalyzerFactory will direct analysis component factories to look for resources + * under the directory specified in the "work.dir" property. + */ +public class AnalyzerFactoryTask extends PerfTask { + private static final String LUCENE_ANALYSIS_PACKAGE_PREFIX = "org.apache.lucene.analysis."; + private static final Pattern ANALYSIS_COMPONENT_SUFFIX_PATTERN + = Pattern.compile("(?s:(?:(?:Token|Char)?Filter|Tokenizer)(?:Factory)?)$"); + private static final Pattern TRAILING_DOT_ZERO_PATTERN = Pattern.compile("\\.0$"); + + private enum ArgType {ANALYZER_ARG, ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER, TOKENFILTER } + + String factoryName = null; + Integer positionIncrementGap = null; + Integer offsetGap = null; + private List charFilterFactories = new ArrayList(); + private TokenizerFactory tokenizerFactory = null; + private List tokenFilterFactories = new ArrayList(); + + public AnalyzerFactoryTask(PerfRunData runData) { + super(runData); + } + + @Override + public int doLogic() { + return 1; + } + + /** + * Sets the params. + * Analysis component factory names may optionally include the "Factory" suffix. 
+ * + * @param params analysis pipeline specification: name, (optional) positionIncrementGap, + * (optional) offsetGap, 0+ CharFilterFactory's, 1 TokenizerFactory, + * and 0+ TokenFilterFactory's + */ + @Override + public void setParams(String params) { + super.setParams(params); + ArgType expectedArgType = ArgType.ANALYZER_ARG; + + final StreamTokenizer stok = new StreamTokenizer(new StringReader(params)); + stok.commentChar('#'); + stok.quoteChar('"'); + stok.quoteChar('\''); + stok.eolIsSignificant(false); + stok.ordinaryChar('('); + stok.ordinaryChar(')'); + stok.ordinaryChar(':'); + stok.ordinaryChar(','); + try { + while (stok.nextToken() != StreamTokenizer.TT_EOF) { + switch (stok.ttype) { + case ',': { + // Do nothing + break; + } + case StreamTokenizer.TT_WORD: { + if (expectedArgType.equals(ArgType.ANALYZER_ARG)) { + final String argName = stok.sval; + if ( ! argName.equalsIgnoreCase("name") + && ! argName.equalsIgnoreCase("positionIncrementGap") + && ! argName.equalsIgnoreCase("offsetGap")) { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Missing 'name' param to AnalyzerFactory: '" + params + "'"); + } + stok.nextToken(); + if (stok.ttype != ':') { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory"); + } + + stok.nextToken(); + String argValue = stok.sval; + switch (stok.ttype) { + case StreamTokenizer.TT_NUMBER: { + argValue = Double.toString(stok.nval); + // Drop the ".0" from numbers, for integer arguments + argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst(""); + // Intentional fallthrough + } + case '"': + case '\'': + case StreamTokenizer.TT_WORD: { + if (argName.equalsIgnoreCase("name")) { + factoryName = argValue; + expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER; + } else { + int intArgValue = 0; + try { + intArgValue = Integer.parseInt(argValue); + } catch (NumberFormatException e) { + throw new RuntimeException + ("Line #" 
+ lineno(stok) + ": Exception parsing " + argName + " value '" + argValue + "'", e); + } + if (argName.equalsIgnoreCase("positionIncrementGap")) { + positionIncrementGap = intArgValue; + } else if (argName.equalsIgnoreCase("offsetGap")) { + offsetGap = intArgValue; + } + } + break; + } + case StreamTokenizer.TT_EOF: { + throw new RuntimeException("Unexpected EOF: " + stok.toString()); + } + default: { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); + } + } + } else if (expectedArgType.equals(ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)) { + final String argName = stok.sval; + + if (argName.equalsIgnoreCase("positionIncrementGap") + || argName.equalsIgnoreCase("offsetGap")) { + stok.nextToken(); + if (stok.ttype != ':') { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory"); + } + stok.nextToken(); + int intArgValue = (int)stok.nval; + switch (stok.ttype) { + case '"': + case '\'': + case StreamTokenizer.TT_WORD: { + intArgValue = 0; + try { + intArgValue = Integer.parseInt(stok.sval.trim()); + } catch (NumberFormatException e) { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + stok.sval + "'", e); + } + // Intentional fall-through + } + case StreamTokenizer.TT_NUMBER: { + if (argName.equalsIgnoreCase("positionIncrementGap")) { + positionIncrementGap = intArgValue; + } else if (argName.equalsIgnoreCase("offsetGap")) { + offsetGap = intArgValue; + } + break; + } + case StreamTokenizer.TT_EOF: { + throw new RuntimeException("Unexpected EOF: " + stok.toString()); + } + default: { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); + } + } + break; + } + try { + final Class clazz; + clazz = lookupAnalysisClass(argName, CharFilterFactory.class); + createAnalysisPipelineComponent(stok, clazz); + } catch (IllegalArgumentException e) { + try { 
+ final Class clazz; + clazz = lookupAnalysisClass(argName, TokenizerFactory.class); + createAnalysisPipelineComponent(stok, clazz); + expectedArgType = ArgType.TOKENFILTER; + } catch (IllegalArgumentException e2) { + throw new RuntimeException("Line #" + lineno(stok) + ": Can't find class '" + + argName + "' as CharFilterFactory or TokenizerFactory"); + } + } + } else { // expectedArgType = ArgType.TOKENFILTER + final String className = stok.sval; + final Class clazz; + try { + clazz = lookupAnalysisClass(className, TokenFilterFactory.class); + } catch (IllegalArgumentException e) { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Can't find class '" + className + "' as TokenFilterFactory"); + } + createAnalysisPipelineComponent(stok, clazz); + } + break; + } + default: { + throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); + } + } + } + } catch (RuntimeException e) { + if (e.getMessage().startsWith("Line #")) { + throw e; + } else { + throw new RuntimeException("Line #" + lineno(stok) + ": ", e); + } + } catch (Throwable t) { + throw new RuntimeException("Line #" + lineno(stok) + ": ", t); + } + + final AnalyzerFactory analyzerFactory = new AnalyzerFactory + (charFilterFactories, tokenizerFactory, tokenFilterFactories); + analyzerFactory.setPositionIncrementGap(positionIncrementGap); + analyzerFactory.setOffsetGap(offsetGap); + getRunData().getAnalyzerFactories().put(factoryName, analyzerFactory); + } + + /** + * Instantiates the given analysis factory class after pulling params from + * the given stream tokenizer, then stores the result in the appropriate + * pipeline component list. 
+ * + * @param stok stream tokenizer from which to draw analysis factory params + * @param clazz analysis factory class to instantiate + */ + private void createAnalysisPipelineComponent + (StreamTokenizer stok, Class clazz) { + final AbstractAnalysisFactory instance; + try { + instance = clazz.newInstance(); + } catch (Exception e) { + throw new RuntimeException("Line #" + lineno(stok) + ": ", e); + } + Version luceneMatchVersion = null; + Map argMap = new HashMap(); + boolean parenthetical = false; + try { + WHILE_LOOP: while (stok.nextToken() != StreamTokenizer.TT_EOF) { + switch (stok.ttype) { + case ',': { + if (parenthetical) { + // Do nothing + break; + } else { + // Finished reading this analysis factory configuration + break WHILE_LOOP; + } + } + case '(': { + if (parenthetical) { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Unexpected opening parenthesis."); + } + parenthetical = true; + break; + } + case ')': { + if (parenthetical) { + parenthetical = false; + } else { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Unexpected closing parenthesis."); + } + break; + } + case StreamTokenizer.TT_WORD: { + if ( ! 
parenthetical) { + throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token '" + stok.sval + "'"); + } + String argName = stok.sval; + stok.nextToken(); + if (stok.ttype != ':') { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to " + clazz.getSimpleName()); + } + stok.nextToken(); + String argValue = stok.sval; + switch (stok.ttype) { + case StreamTokenizer.TT_NUMBER: { + argValue = Double.toString(stok.nval); + // Drop the ".0" from numbers, for integer arguments + argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst(""); + // Intentional fall-through + } + case '"': + case '\'': + case StreamTokenizer.TT_WORD: { + if (argName.equalsIgnoreCase("luceneMatchVersion")) { + try { + luceneMatchVersion = Version.parseLeniently(argValue); + } catch (IllegalArgumentException e) { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Unrecognized luceneMatchVersion '" + argValue + "'", e); + } + } else { + argMap.put(argName, argValue); + } + break; + } + case StreamTokenizer.TT_EOF: { + throw new RuntimeException("Unexpected EOF: " + stok.toString()); + } + default: { + throw new RuntimeException + ("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString()); + } + } + } + } + } + + instance.setLuceneMatchVersion + (null == luceneMatchVersion ? 
Version.LUCENE_CURRENT : luceneMatchVersion); + instance.init(argMap); + if (instance instanceof ResourceLoaderAware) { + File baseDir = new File(getRunData().getConfig().get("work.dir", "work")).getAbsoluteFile(); + ((ResourceLoaderAware)instance).inform(new FilesystemResourceLoader(baseDir)); + } + if (CharFilterFactory.class.isAssignableFrom(clazz)) { + charFilterFactories.add((CharFilterFactory)instance); + } else if (TokenizerFactory.class.isAssignableFrom(clazz)) { + tokenizerFactory = (TokenizerFactory)instance; + } else if (TokenFilterFactory.class.isAssignableFrom(clazz)) { + tokenFilterFactories.add((TokenFilterFactory)instance); + } + } catch (RuntimeException e) { + if (e.getMessage().startsWith("Line #")) { + throw (e); + } else { + throw new RuntimeException("Line #" + lineno(stok) + ": ", e); + } + } catch (Throwable t) { + throw new RuntimeException("Line #" + lineno(stok) + ": ", t); + } + } + + /** + * This method looks up a class with its fully qualified name (FQN), or a short-name + * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis." + * as the package prefix (e.g. "standard.ClassicTokenizerFactory" -> + * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory"). + * + * If className contains a period, the class is first looked up as-is, assuming that it + * is an FQN. If this fails, lookup is retried after prepending the Lucene analysis + * package prefix to the class name. + * + * If className does not contain a period, the analysis SPI *Factory.lookupClass() + * methods are used to find the class. + * + * @param className The name or the short name of the class. + * @param expectedType The superclass className is expected to extend + * @return the loaded class. 
+ * @throws ClassNotFoundException if lookup fails + */ + public Class lookupAnalysisClass(String className, Class expectedType) + throws ClassNotFoundException { + if (className.contains(".")) { + try { + // First, try className == FQN + return Class.forName(className).asSubclass(expectedType); + } catch (ClassNotFoundException e) { + try { + // Second, retry lookup after prepending the Lucene analysis package prefix + return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType); + } catch (ClassNotFoundException e1) { + throw new ClassNotFoundException("Can't find class '" + className + + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'"); + } + } + } + // No dot - use analysis SPI lookup + final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst(""); + if (CharFilterFactory.class.isAssignableFrom(expectedType)) { + return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType); + } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) { + return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType); + } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) { + return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType); + } + + throw new ClassNotFoundException("Can't find class '" + className + "'"); + } + + + /* (non-Javadoc) + * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() + */ + @Override + public boolean supportsParams() { + return true; + } + + /** Returns the current line in the algorithm file */ + public int lineno(StreamTokenizer stok) { + return getAlgLineNum() + stok.lineno(); + } +} diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java index 8ce123e160f..08543944b20 100644 --- 
a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java @@ -16,10 +16,16 @@ package org.apache.lucene.benchmark.byTask.tasks; */ import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory; import org.apache.lucene.util.Version; import java.io.IOException; +import java.io.StreamTokenizer; +import java.io.StringReader; import java.util.*; import java.lang.reflect.Constructor; @@ -28,12 +34,12 @@ import java.lang.reflect.Constructor; * */ public class NewAnalyzerTask extends PerfTask { - private List analyzerClassNames; + private List analyzerNames; private int current; public NewAnalyzerTask(PerfRunData runData) { super(runData); - analyzerClassNames = new ArrayList(); + analyzerNames = new ArrayList(); } public static final Analyzer createAnalyzer(String className) throws Exception{ @@ -50,55 +56,98 @@ public class NewAnalyzerTask extends PerfTask { @Override public int doLogic() throws IOException { - String className = null; + String analyzerName = null; try { - if (current >= analyzerClassNames.size()) { + if (current >= analyzerNames.size()) { current = 0; } - className = analyzerClassNames.get(current++); + analyzerName = analyzerNames.get(current++); Analyzer analyzer = null; - if (null == className || 0 == className.length()) { - className = "org.apache.lucene.analysis.standard.StandardAnalyzer"; + if (null == analyzerName || 0 == analyzerName.length()) { + analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer"; } - if (-1 == className.indexOf(".")) { - try { - // If no package, first attempt to instantiate a core analyzer - String 
coreClassName = "org.apache.lucene.analysis.core." + className; - analyzer = createAnalyzer(coreClassName); - className = coreClassName; - } catch (ClassNotFoundException e) { - // If not a core analyzer, try the base analysis package - className = "org.apache.lucene.analysis." + className; - analyzer = createAnalyzer(className); - } + // First, lookup analyzerName as a named analyzer factory + AnalyzerFactory factory = getRunData().getAnalyzerFactories().get(analyzerName); + if (null != factory) { + analyzer = factory.create(); } else { - if (className.startsWith("standard.")) { - className = "org.apache.lucene.analysis." + className; + if (analyzerName.contains(".")) { + if (analyzerName.startsWith("standard.")) { + analyzerName = "org.apache.lucene.analysis." + analyzerName; + } + analyzer = createAnalyzer(analyzerName); + } else { // No package + try { + // Attempt to instantiate a core analyzer + String coreClassName = "org.apache.lucene.analysis.core." + analyzerName; + analyzer = createAnalyzer(coreClassName); + analyzerName = coreClassName; + } catch (ClassNotFoundException e) { + // If not a core analyzer, try the base analysis package + analyzerName = "org.apache.lucene.analysis." + analyzerName; + analyzer = createAnalyzer(analyzerName); + } } - analyzer = createAnalyzer(className); } getRunData().setAnalyzer(analyzer); - System.out.println("Changed Analyzer to: " + className); } catch (Exception e) { - throw new RuntimeException("Error creating Analyzer: " + className, e); + throw new RuntimeException("Error creating Analyzer: " + analyzerName, e); } return 1; } /** - * Set the params (analyzerClassName only), Comma-separate list of Analyzer class names. If the Analyzer lives in + * Set the params (analyzerName only), Comma-separate list of Analyzer class names. If the Analyzer lives in * org.apache.lucene.analysis, the name can be shortened by dropping the o.a.l.a part of the Fully Qualified Class Name. *

    + * Analyzer names may also refer to previously defined AnalyzerFactory's. + *

    * Example Declaration: {"NewAnalyzer" NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer) > + *

    + * Example AnalyzerFactory usage: + *

    +   * -AnalyzerFactory(name:'whitespace tokenized',WhitespaceTokenizer)
    +   * -NewAnalyzer('whitespace tokenized')
    +   * 
    * @param params analyzerClassName, or empty for the StandardAnalyzer */ @Override public void setParams(String params) { super.setParams(params); - for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) { - String s = tokenizer.nextToken(); - analyzerClassNames.add(s.trim()); + final StreamTokenizer stok = new StreamTokenizer(new StringReader(params)); + stok.quoteChar('"'); + stok.quoteChar('\''); + stok.eolIsSignificant(false); + stok.ordinaryChar(','); + try { + while (stok.nextToken() != StreamTokenizer.TT_EOF) { + switch (stok.ttype) { + case ',': { + // Do nothing + break; + } + case '\'': + case '\"': + case StreamTokenizer.TT_WORD: { + analyzerNames.add(stok.sval); + break; + } + default: { + throw new RuntimeException("Unexpected token: " + stok.toString()); + } + } + } + } catch (RuntimeException e) { + if (e.getMessage().startsWith("Line #")) { + throw e; + } else { + throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", e); + } + } catch (Throwable t) { + throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", t); } + + } /* (non-Javadoc) diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java deleted file mode 100644 index 3d42b18cb99..00000000000 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java +++ /dev/null @@ -1,117 +0,0 @@ -package org.apache.lucene.benchmark.byTask.tasks; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.StringTokenizer; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper; -import org.apache.lucene.analysis.shingle.ShingleFilter; -import org.apache.lucene.benchmark.byTask.PerfRunData; - -/** - * Task to support benchmarking ShingleFilter / ShingleAnalyzerWrapper - *

    - *

      - *
    • NewShingleAnalyzer (constructs with all defaults) - *
    • NewShingleAnalyzer(analyzer:o.a.l.analysis.StandardAnalyzer,maxShingleSize:2,outputUnigrams:true) - *
    - *

    - */ -public class NewShingleAnalyzerTask extends PerfTask { - - private String analyzerClassName = "standard.StandardAnalyzer"; - private int maxShingleSize = 2; - private boolean outputUnigrams = true; - - public NewShingleAnalyzerTask(PerfRunData runData) { - super(runData); - } - - private void setAnalyzer() throws Exception { - Analyzer wrappedAnalyzer = null; - if (null == analyzerClassName || 0 == analyzerClassName.length()) { - analyzerClassName = "org.apache.lucene.analysis.standard.StandardAnalyzer"; - } - if (-1 == analyzerClassName.indexOf(".")) { - String coreClassName = "org.apache.lucene.analysis.core." + analyzerClassName; - try { - // If there is no package, first attempt to instantiate a core analyzer - wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(coreClassName); - analyzerClassName = coreClassName; - } catch (ClassNotFoundException e) { - // If this is not a core analyzer, try the base analysis package - analyzerClassName = "org.apache.lucene.analysis." + analyzerClassName; - wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName); - } - } else { - if (analyzerClassName.startsWith("standard.")) { - analyzerClassName = "org.apache.lucene.analysis." 
+ analyzerClassName; - } - wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName); - } - - ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( - wrappedAnalyzer, - ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, - maxShingleSize, - ShingleFilter.TOKEN_SEPARATOR, - outputUnigrams, - false); - getRunData().setAnalyzer(analyzer); - } - - @Override - public int doLogic() throws Exception { - try { - setAnalyzer(); - System.out.println - ("Changed Analyzer to: ShingleAnalyzerWrapper, wrapping ShingleFilter over " - + analyzerClassName); - } catch (Exception e) { - throw new RuntimeException("Error creating Analyzer", e); - } - return 1; - } - - @Override - public void setParams(String params) { - super.setParams(params); - StringTokenizer st = new StringTokenizer(params, ","); - while (st.hasMoreTokens()) { - String param = st.nextToken(); - StringTokenizer expr = new StringTokenizer(param, ":"); - String key = expr.nextToken(); - String value = expr.nextToken(); - if (key.equalsIgnoreCase("analyzer")) { - analyzerClassName = value; - } else if (key.equalsIgnoreCase("outputUnigrams")) { - outputUnigrams = Boolean.parseBoolean(value); - } else if (key.equalsIgnoreCase("maxShingleSize")) { - maxShingleSize = (int)Double.parseDouble(value); - } else { - throw new RuntimeException("Unknown parameter " + param); - } - } - } - - @Override - public boolean supportsParams() { - return true; - } -} diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java index 4af1d4df06d..d7a39ffd01b 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java @@ -62,6 +62,9 @@ public abstract class PerfTask implements Cloneable { private boolean runInBackground; private int deltaPri; + // The first line of this task's definition in the alg 
file + private int algLineNum = 0; + protected static final String NEW_LINE = System.getProperty("line.separator"); /** Should not be used externally */ @@ -317,4 +320,11 @@ public abstract class PerfTask implements Cloneable { this.disableCounting = disableCounting; } + public void setAlgLineNum(int algLineNum) { + this.algLineNum = algLineNum; + } + + public int getAlgLineNum() { + return algLineNum; + } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java index fec744e10ca..ef9092c4561 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java @@ -58,11 +58,12 @@ public class Algorithm { StreamTokenizer stok = new StreamTokenizer(new StringReader(algTxt)); stok.commentChar('#'); stok.eolIsSignificant(false); - stok.ordinaryChar('"'); + stok.quoteChar('"'); + stok.quoteChar('\''); stok.ordinaryChar('/'); stok.ordinaryChar('('); stok.ordinaryChar(')'); - boolean colonOk = false; + boolean colonOk = false; boolean isDisableCountNextTask = false; // only for primitive tasks currSequence.setDepth(0); @@ -74,6 +75,7 @@ public class Algorithm { Constructor cnstr = taskClass(config,s) .asSubclass(PerfTask.class).getConstructor(PerfRunData.class); PerfTask task = cnstr.newInstance(runData); + task.setAlgLineNum(stok.lineno()); task.setDisableCounting(isDisableCountNextTask); isDisableCountNextTask = false; currSequence.addTask(task); @@ -90,24 +92,54 @@ public class Algorithm { if (stok.ttype!='(') { stok.pushBack(); } else { - // get params, for tasks that supports them, - anything until next ')' + // get params, for tasks that supports them - allow recursive parenthetical expressions + stok.eolIsSignificant(true); // Allow params tokenizer to keep track of line number StringBuilder params = new StringBuilder(); 
stok.nextToken(); - while (stok.ttype!=')') { - switch (stok.ttype) { - case StreamTokenizer.TT_NUMBER: - params.append(stok.nval); - break; - case StreamTokenizer.TT_WORD: - params.append(stok.sval); - break; - case StreamTokenizer.TT_EOF: - throw new Exception("unexpexted EOF: - "+stok.toString()); - default: - params.append((char)stok.ttype); + if (stok.ttype != ')') { + int count = 1; + BALANCED_PARENS: while (true) { + switch (stok.ttype) { + case StreamTokenizer.TT_NUMBER: { + params.append(stok.nval); + break; + } + case StreamTokenizer.TT_WORD: { + params.append(stok.sval); + break; + } + case StreamTokenizer.TT_EOF: { + throw new RuntimeException("Unexpexted EOF: - "+stok.toString()); + } + case '"': + case '\'': { + params.append((char)stok.ttype); + // re-escape delimiters, if any + params.append(stok.sval.replaceAll("" + (char)stok.ttype, "\\\\" + (char)stok.ttype)); + params.append((char)stok.ttype); + break; + } + case '(': { + params.append((char)stok.ttype); + ++count; + break; + } + case ')': { + if (--count >= 1) { // exclude final closing parenthesis + params.append((char)stok.ttype); + } else { + break BALANCED_PARENS; + } + break; + } + default: { + params.append((char)stok.ttype); + } + } + stok.nextToken(); } - stok.nextToken(); } + stok.eolIsSignificant(false); String prm = params.toString().trim(); if (prm.length()>0) { task.setParams(prm); @@ -182,10 +214,8 @@ public class Algorithm { if (stok.ttype!='"') { stok.pushBack(); } else { - stok.nextToken(); name = stok.sval; - stok.nextToken(); - if (stok.ttype!='"' || name==null || name.length()==0) { + if (stok.ttype!='"' || name==null || name.length()==0) { throw new Exception("sequence name problem - "+stok.toString()); } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/AnalyzerFactory.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/AnalyzerFactory.java new file mode 100644 index 00000000000..da1de943106 --- /dev/null +++ 
b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/AnalyzerFactory.java @@ -0,0 +1,132 @@ +package org.apache.lucene.benchmark.byTask.utils; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.List; + +/** + * A factory to create an analyzer. 
+ * See {@link org.apache.lucene.benchmark.byTask.tasks.AnalyzerFactoryTask} + */ +public final class AnalyzerFactory { + final private List charFilterFactories; + final private TokenizerFactory tokenizerFactory; + final private List tokenFilterFactories; + private String name = null; + private Integer positionIncrementGap = null; + private Integer offsetGap = null; + + public AnalyzerFactory(List charFilterFactories, + TokenizerFactory tokenizerFactory, + List tokenFilterFactories) { + this.charFilterFactories = charFilterFactories; + assert null != tokenizerFactory; + this.tokenizerFactory = tokenizerFactory; + this.tokenFilterFactories = tokenFilterFactories; + } + + public void setName(String name) { + this.name = name; + } + + public void setPositionIncrementGap(Integer positionIncrementGap) { + this.positionIncrementGap = positionIncrementGap; + } + + public void setOffsetGap(Integer offsetGap) { + this.offsetGap = offsetGap; + } + + public Analyzer create() { + return new Analyzer() { + private final Integer positionIncrementGap = AnalyzerFactory.this.positionIncrementGap; + private final Integer offsetGap = AnalyzerFactory.this.offsetGap; + + @Override + public Reader initReader(String fieldName, Reader reader) { + if (charFilterFactories != null && charFilterFactories.size() > 0) { + Reader wrappedReader = reader; + for (CharFilterFactory charFilterFactory : charFilterFactories) { + wrappedReader = charFilterFactory.create(wrappedReader); + } + reader = wrappedReader; + } + return reader; + } + + @Override + protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) { + final Tokenizer tokenizer = tokenizerFactory.create(reader); + TokenStream tokenStream = tokenizer; + for (TokenFilterFactory filterFactory : tokenFilterFactories) { + tokenStream = filterFactory.create(tokenStream); + } + return new TokenStreamComponents(tokenizer, tokenStream); + } + + @Override + public int getPositionIncrementGap(String fieldName) { + 
return null == positionIncrementGap ? super.getPositionIncrementGap(fieldName) : positionIncrementGap; + } + + @Override + public int getOffsetGap(String fieldName) { + return null == offsetGap ? super.getOffsetGap(fieldName) : offsetGap; + } + }; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("AnalyzerFactory("); + if (null != name) { + sb.append("name:"); + sb.append(name); + sb.append(", "); + } + if (null != positionIncrementGap) { + sb.append("positionIncrementGap:"); + sb.append(positionIncrementGap); + sb.append(", "); + } + if (null != offsetGap) { + sb.append("offsetGap:"); + sb.append(offsetGap); + sb.append(", "); + } + for (CharFilterFactory charFilterFactory: charFilterFactories) { + sb.append(charFilterFactory); + sb.append(", "); + } + sb.append(tokenizerFactory); + for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) { + sb.append(", "); + sb.append(tokenFilterFactory); + } + sb.append(')'); + return sb.toString(); + } +} diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java index 0cc49e32033..1a201b69698 100755 --- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java +++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java @@ -71,6 +71,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase { public void setUp() throws Exception { super.setUp(); copyToWorkDir("reuters.first20.lines.txt"); + copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt"); } /** @@ -1020,63 +1021,79 @@ public class TestPerfTasksLogic extends BenchmarkTestCase { } /** - * Test that we can create ShingleAnalyzerWrappers. + * Test that we can create shingle analyzers using AnalyzerFactory. 
*/ public void testShingleAnalyzer() throws Exception { String text = "one,two,three, four five six"; - // Default analyzer, maxShingleSize, and outputUnigrams - Benchmark benchmark = execBenchmark(getShingleConfig("")); + // StandardTokenizer, maxShingleSize, and outputUnigrams + Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig + ("shingle-analyzer", "StandardTokenizer,ShingleFilter")); benchmark.getRunData().getAnalyzer().tokenStream ("bogus", new StringReader(text)).close(); - assertEqualShingle(benchmark.getRunData().getAnalyzer(), text, - new String[] {"one", "one two", "two", "two three", - "three", "three four", "four", "four five", - "five", "five six", "six"}); - // Default analyzer, maxShingleSize = 3, and outputUnigrams = false + BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text, + new String[] { "one", "one two", "two", "two three", + "three", "three four", "four", "four five", + "five", "five six", "six" }); + // StandardTokenizer, maxShingleSize = 3, and outputUnigrams = false benchmark = execBenchmark - (getShingleConfig("maxShingleSize:3,outputUnigrams:false")); - assertEqualShingle(benchmark.getRunData().getAnalyzer(), text, - new String[] { "one two", "one two three", "two three", - "two three four", "three four", - "three four five", "four five", - "four five six", "five six" }); - // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams + (getAnalyzerFactoryConfig + ("shingle-analyzer", + "StandardTokenizer,ShingleFilter(maxShingleSize:3,outputUnigrams:false)")); + BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text, + new String[] { "one two", "one two three", "two three", + "two three four", "three four", + "three four five", "four five", + "four five six", "five six" }); + // WhitespaceTokenizer, default maxShingleSize and outputUnigrams benchmark = execBenchmark - (getShingleConfig("analyzer:WhitespaceAnalyzer")); - 
assertEqualShingle(benchmark.getRunData().getAnalyzer(), text, - new String[] { "one,two,three,", "one,two,three, four", - "four", "four five", "five", "five six", - "six" }); + (getAnalyzerFactoryConfig("shingle-analyzer", "WhitespaceTokenizer,ShingleFilter")); + BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text, + new String[] { "one,two,three,", "one,two,three, four", + "four", "four five", "five", "five six", + "six" }); - // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false + // WhitespaceTokenizer, maxShingleSize=3 and outputUnigrams=false benchmark = execBenchmark - (getShingleConfig - ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer")); - assertEqualShingle(benchmark.getRunData().getAnalyzer(), text, - new String[] { "one,two,three, four", - "one,two,three, four five", - "four five", "four five six", - "five six" }); + (getAnalyzerFactoryConfig + ("shingle-factory", + "WhitespaceTokenizer,ShingleFilter(outputUnigrams:false,maxShingleSize:3)")); + BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text, + new String[] { "one,two,three, four", + "one,two,three, four five", + "four five", "four five six", + "five six" }); } - private void assertEqualShingle - (Analyzer analyzer, String text, String[] expected) throws Exception { - BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, text, expected); - } - - private String[] getShingleConfig(String params) { + private String[] getAnalyzerFactoryConfig(String name, String params) { + final String singleQuoteEscapedName = name.replaceAll("'", "\\\\'"); String algLines[] = { "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", "docs.file=" + getReuters20LinesFile(), + "work.dir=" + getWorkDir().getAbsolutePath(), "content.source.forever=false", "directory=RAMDirectory", - "NewShingleAnalyzer(" + params + ")", + "AnalyzerFactory(name:'" + singleQuoteEscapedName + "', " + params + ")", + "NewAnalyzer('" + 
singleQuoteEscapedName + "')", "CreateIndex", "{ \"AddDocs\" AddDoc > : * " }; return algLines; } + + public void testAnalyzerFactory() throws Exception { + String text = "Fortieth, Quarantième, Cuadragésimo"; + Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig + ("ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'", + "positionIncrementGap:100,offsetGap:1111," + +"MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt')," + +"PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\")," + +"StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)")); + BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text, + new String[] { "fo", "or", "rt", "ti", "ie", "et", "th", + "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix", "xx", "xx", "xe", + "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs", "si", "io"}); + } private String getReuters20LinesFile() { return getWorkDirResourcePath("reuters.first20.lines.txt"); diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/test-mapping-ISOLatin1Accent-partial.txt b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/test-mapping-ISOLatin1Accent-partial.txt new file mode 100644 index 00000000000..0ff17dbbe3f --- /dev/null +++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/test-mapping-ISOLatin1Accent-partial.txt @@ -0,0 +1,30 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Syntax: +# "source" => "target" +# "source".length() > 0 (source cannot be empty.) +# "target".length() >= 0 (target can be empty.) + +# example: +# "À" => "A" +# "\u00C0" => "A" +# "\u00C0" => "\u0041" +# "ß" => "ss" +# "\t" => " " +# "\n" => "" + +# è => e +"\u00E8" => "e" + +# é => e +"\u00E9" => "e"