LUCENE-2181: add a benchmark for collation

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@898491 13f79535-47bb-0310-9956-ffa450edef68
2010-01-12 20:06:17 +00:00 · 2010-01-12 20:06:17 +00:00 · eaed8e67d2
parent aba4c8bed4
commit eaed8e67d2
10 changed files with 671 additions and 4 deletions
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@ -4,6 +4,16 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety

 $Id:$

+1/11/2010
+  LUCENE-2181: Add a benchmark for collation. This adds NewLocaleTask,
+  which sets a Locale in the run data for collation to use, and can be
+  used in the future for benchmarking localized range queries and sorts.
+  Also add NewCollationAnalyzerTask, which works with both JDK and ICU
+  Collator implementations. Fix ReadTokensTask to not tokenize fields
+  unless they should be tokenized according to DocMaker config. The 
+  easiest way to run the benchmark is to run 'ant collation'
+  (Steven Rowe via Robert Muir)
+
 12/22/2009
  LUCENE-2178: Allow multiple locations to add to the class path with
  -Dbenchmark.ext.classpath=... when running "ant run-task" (Steven
--- a/contrib/benchmark/build.xml
+++ b/contrib/benchmark/build.xml
@ -24,7 +24,10 @@
        <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
        <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
        <available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>
-
+    	<available file="temp/${top.100k.words.archive.filename}"
+                   property="top.100k.words.archive.present"/>
+    	<available file="${working.dir}/top100k-out" 
+                   property="top.100k.word.files.expanded"/>
    </target>

    <target name="enwiki-files" depends="check-files">
@ -94,6 +97,27 @@
        <untar src="temp/mini_newsgroups.tar" dest="${working.dir}"/>
    </target>

+	<property name="top.100k.words.archive.filename" 
+	          value="top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"/>
+	<property name="top.100k.words.archive.base.url"
+	          value="http://people.apache.org/~rmuir/wikipedia"/>
+	<target name="get-top-100k-words-archive" unless="top.100k.words.archive.present">
+		<mkdir dir="temp"/>
+	    <get src="${top.100k.words.archive.base.url}/${top.100k.words.archive.filename}"
+	         dest="temp/${top.100k.words.archive.filename}"/>
+	</target>
+	<target name="expand-top-100k-word-files" unless="top.100k.word.files.expanded">
+		<mkdir dir="${working.dir}/top100k-out"/>
+	    <untar src="temp/${top.100k.words.archive.filename}"
+	           overwrite="true" compression="bzip2" dest="${working.dir}/top100k-out"/>
+	</target>
+	
+	<target name="top-100k-wiki-word-files" depends="check-files">
+	  <mkdir dir="${working.dir}"/>
+	  <antcall target="get-top-100k-words-archive"/>
+	  <antcall target="expand-top-100k-word-files"/>
+	</target>
+	
    <target name="get-files" depends="check-files">
        <mkdir dir="temp"/>
        <antcall target="get-reuters"/>
@ -141,6 +165,34 @@
        </java>
    </target>

+	<property name="collation.alg.file" location="conf/collation.alg"/>
+	<property name="collation.output.file" 
+	          value="${working.dir}/collation.benchmark.output.txt"/>
+	<property name="collation.jira.output.file" 
+	          value="${working.dir}/collation.bm2jira.output.txt"/>
+	
+	<path id="collation.runtime.classpath">
+	  <path refid="run.classpath"/>
+	  <pathelement path="${common.dir}/build/contrib/icu/classes/java"/>
+      <fileset dir="${common.dir}/contrib/icu/lib" includes="icu4j*.jar"/>
+	</path>
+	
+	<target name="collation" depends="compile,compile-icu,top-100k-wiki-word-files">
+	    <echo>Running contrib/benchmark with alg file: ${collation.alg.file}</echo>
+	    <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark" 
+	          maxmemory="${task.mem}" output="${collation.output.file}">
+	      <classpath refid="collation.runtime.classpath"/>
+	      <arg file="${collation.alg.file}"/>
+	    </java>
+	    <echo>Benchmark output is in file: ${collation.output.file}</echo>
+	    <echo>Converting to JIRA table format...</echo>
+	    <exec executable="perl" output="${collation.jira.output.file}" failonerror="true">
+	      <arg value="scripts/collation.bm2jira.pl"/>
+	      <arg value="${collation.output.file}"/>
+	    </exec>
+	    <echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
+	</target>
+	
    <target name="compile-demo">
      <subant target="compile-demo">
         <fileset dir="${common.dir}" includes="build.xml"/>
@ -151,6 +203,11 @@
         <fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
      </subant>
    </target>
+    <target name="compile-icu">
+      <subant target="compile">
+         <fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
+      </subant>
+    </target>
    <target name="compile-memory">
      <subant target="compile">
         <fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>
--- a/contrib/benchmark/conf/collation.alg
+++ b/contrib/benchmark/conf/collation.alg
@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
+content.source.encoding=UTF-8
+doc.tokenized=false
+doc.body.tokenized=true
+docs.file=work/top100k-out/top.fr.wikipedia.words.txt
+content.source.forever=false
+log.step=100000
+
+{ "Rounds"
+    -NewAnalyzer(KeywordAnalyzer)
+    -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+    ResetInputs
+    { "FrenchKeyword" { ReadTokens > : * ResetInputs } : 10
+
+    -NewAnalyzer(KeywordAnalyzer)
+    -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+    ResetInputs
+    { "GermanKeyword" { ReadTokens > : * ResetInputs } : 10
+
+    -NewAnalyzer(KeywordAnalyzer)
+    -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+    ResetInputs
+    { "UkrainianKeyword" { ReadTokens > : * ResetInputs } : 10
+ 
+    -NewAnalyzer(KeywordAnalyzer)
+    -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+    ResetInputs
+    { "EnglishKeyword" { ReadTokens > : * ResetInputs } : 10
+ 
+    -NewLocale(fr)
+    -NewCollationAnalyzer
+    -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+    ResetInputs
+    { "FrenchJDK" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(de)
+    -NewCollationAnalyzer
+    -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+    ResetInputs
+    { "GermanJDK" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(uk)
+    -NewCollationAnalyzer
+    -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+    ResetInputs
+    { "UkrainianJDK" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(en)
+    -NewCollationAnalyzer
+    -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+    ResetInputs
+    { "EnglishJDK" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(fr)
+    -NewCollationAnalyzer(impl:icu)
+    -SetProp(docs.file,work/top100k-out/top.fr.wikipedia.words.txt)
+    ResetInputs
+    { "FrenchICU" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(de)
+    -NewCollationAnalyzer(impl:icu)
+    -SetProp(docs.file,work/top100k-out/top.de.wikipedia.words.txt)
+    ResetInputs
+    { "GermanICU" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(uk)
+    -NewCollationAnalyzer(impl:icu)
+    -SetProp(docs.file,work/top100k-out/top.uk.wikipedia.words.txt)
+    ResetInputs
+    { "UkrainianICU" { ReadTokens > : * ResetInputs } : 10
+
+    -NewLocale(en)
+    -NewCollationAnalyzer(impl:icu)
+    -SetProp(docs.file,work/top100k-out/top.en.wikipedia.words.txt)
+    ResetInputs
+    { "EnglishICU" { ReadTokens > : * ResetInputs } : 10
+
+    NewRound
+
+} : 5
+
+RepSumByNameRound
--- a/contrib/benchmark/scripts/collation.bm2jira.pl
+++ b/contrib/benchmark/scripts/collation.bm2jira.pl
@ -0,0 +1,63 @@
+#!/usr/bin/perl
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ----------
+# bm2jira.pl
+#
+# Converts Lucene contrib-benchmark output produced using the 
+# benchmark.collation.alg file into a JIRA-formatted table.
+#
+
+use strict;
+use warnings;
+
+my %min_elapsed = ();
+
+while (<>) {
+  if (/(\S+)(Keyword|JDK|ICU)_\d+\s*([^\s{].*)/) {
+    my $lang = $1;
+    my $analyzer = $2;
+    my $stats = $3;
+    my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/;
+    $min_elapsed{$analyzer}{$lang} = $elapsed
+      unless (defined($min_elapsed{$analyzer}{$lang})
+              && $elapsed >= $min_elapsed{$analyzer}{$lang});
+  }
+}
+
+# Print out platform info
+print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
+if ($^O =~ /win/i) {
+  print "$^O\n";
+  eval {
+    require Win32;
+    print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
+  };
+  die "Error loading Win32: $@" if ($@);
+} else {
+  print `uname -a 2>&1`;
+}
+
+print "\n||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||\n";
+
+for my $lang (sort keys %{$min_elapsed{ICU}}) {
+  my $ICU = $min_elapsed{ICU}{$lang};
+  my $JDK = $min_elapsed{JDK}{$lang};
+  my $keyword = $min_elapsed{Keyword}{$lang};
+  my $improved = int(100 * ($JDK - $ICU) / ($ICU - $keyword) + 0.5);
+  printf "|$lang|${JDK}s|${ICU}s|${keyword}s|\%d%%|\n", $improved;
+}
--- a/contrib/benchmark/scripts/compare.collation.benchmark.tables.pl
+++ b/contrib/benchmark/scripts/compare.collation.benchmark.tables.pl
@ -0,0 +1,91 @@
+#!/usr/bin/perl
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ------------------------------------------
+# compare.collation.benchmark.jira.tables.pl
+#
+# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced
+# by bm2jira.pl (located in the same directory as this script), and outputs a
+# third JIRA-formatted comparison table, showing the differences between two
+# benchmarking runs' java.text and ICU4J columns, after accounting for the
+# KeywordAnalyzer column; the "ICU4J Improvement" column is ignored.
+#
+# The difference is calculated as a percentage:
+#
+#   100 * (patched-rate - unpatched-rate / unpatched-rate)
+#
+# where the (un)patched-rate is:
+#
+#   1 / ( elapsed-(un)patched-time - elapsed-KeywordAnalyzer-time)
+#
+
+use strict;
+use warnings;
+
+my $usage = "Usage: $0 <unpatched-file> <patched-file>\n";
+
+die $usage unless ($#ARGV == 1 && -f $ARGV[0] && -f $ARGV[1]);
+
+my %stats = ();
+
+open UNPATCHED, "<$ARGV[0]" || die "ERROR opening '$ARGV[0]': $!";
+while (<UNPATCHED>) {
+  # ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||
+  # |English|4.51s|2.47s|1.47s|204%|
+  next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/);
+  my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed)
+    = ($1, $2, $3, $4);
+  $stats{unpatched}{$lang}{jdk} = $jdk_elapsed;
+  $stats{unpatched}{$lang}{icu} = $icu_elapsed;
+  $stats{unpatched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed;
+}
+close UNPATCHED;
+
+open PATCHED, "<$ARGV[1]" || die "ERROR opening '$ARGV[1]': $!";
+while (<PATCHED>) {
+  # ||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||
+  # |English|4.51s|2.47s|1.47s|204%|
+  next unless (/^\|([^|]+)\|([^|s]+)s\|([^|s]+)s\|([^|s]+)s/);
+  my ($lang, $jdk_elapsed, $icu_elapsed, $keyword_analyzer_elapsed)
+    = ($1, $2, $3, $4);
+  $stats{patched}{$lang}{jdk} = $jdk_elapsed;
+  $stats{patched}{$lang}{icu} = $icu_elapsed;
+  $stats{patched}{$lang}{keyword_analyzer} = $keyword_analyzer_elapsed;
+}
+close PATCHED;
+
+print "||Language||java.text improvement||ICU4J improvement||\n";
+for my $lang (sort keys %{$stats{unpatched}}) {
+  my $keyword_analyzer1 = $stats{unpatched}{$lang}{keyword_analyzer};
+  my $jdk1 = $stats{unpatched}{$lang}{jdk};
+  my $jdk_diff1 = $jdk1 - $keyword_analyzer1;
+  my $icu1 = $stats{unpatched}{$lang}{icu};
+  my $icu_diff1 = $icu1 - $keyword_analyzer1;
+
+  my $keyword_analyzer2 = $stats{patched}{$lang}{keyword_analyzer};
+  my $jdk2 = $stats{patched}{$lang}{jdk};
+  my $jdk_diff2 = $jdk2 - $keyword_analyzer2;
+  my $icu2 = $stats{patched}{$lang}{icu};
+  my $icu_diff2 = $icu2 - $keyword_analyzer2;
+
+  my $jdk_impr 
+    = int((1./$jdk_diff2 - 1./$jdk_diff1) / (1./$jdk_diff1) * 1000 + 5) / 10;
+  my $icu_impr
+    = int((1./$icu_diff2 - 1./$icu_diff1) / (1./$icu_diff1) * 1000 + 5) / 10;
+
+  printf "|$lang|%2.1f%%|%2.1f%%|\n", $jdk_impr, $icu_impr;
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask;
 import java.io.File;
 import java.io.IOException;
 import java.util.HashMap;
+import java.util.Locale;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@ -61,6 +62,7 @@ public class PerfRunData {
  private Directory directory;
  private Analyzer analyzer;
  private DocMaker docMaker;
+  private Locale locale;
  
  // we use separate (identical) instances for each "read" task type, so each can iterate the quries separately.
  private HashMap<Class<? extends ReadTask>,QueryMaker> readTaskQueryMaker;
@ -244,6 +246,20 @@ public class PerfRunData {
    return docMaker;
  }

+  /**
+   * @return the locale
+   */
+  public Locale getLocale() {
+    return locale;
+  }
+
+  /**
+   * @param locale the locale to set
+   */
+  public void setLocale(Locale locale) {
+    this.locale = locale;
+  }
+
  /**
   * @return Returns the config.
   */
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewCollationAnalyzerTask.java
@ -0,0 +1,117 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+import java.util.Locale;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+
+/**
+ * Task to support benchmarking collation.
+ * <p>
+ * <ul>
+ *  <li> <code>NewCollationAnalyzer</code> with the default jdk impl
+ *  <li> <code>NewCollationAnalyzer(impl:icu)</code> specify an impl (jdk,icu)
+ * </ul>
+ * </p>
+ */
+public class NewCollationAnalyzerTask extends PerfTask {
+  public enum Implementation { 
+    JDK("org.apache.lucene.collation.CollationKeyAnalyzer", 
+        "java.text.Collator"),
+    ICU("org.apache.lucene.collation.ICUCollationKeyAnalyzer", 
+        "com.ibm.icu.text.Collator");
+    
+    String className;
+    String collatorClassName;
+    
+    Implementation(String className, String collatorClassName) {
+      this.className = className;
+      this.collatorClassName = collatorClassName;
+    }
+  }
+  
+  private Implementation impl = Implementation.JDK;
+
+  public NewCollationAnalyzerTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  static Analyzer createAnalyzer(Locale locale, Implementation impl)
+      throws Exception {
+    final Class<?> collatorClazz = Class.forName(impl.collatorClassName);
+    Method collatorMethod = collatorClazz.getMethod("getInstance",
+        new Class[] {Locale.class});
+    Object collator = collatorMethod.invoke(null, locale);
+    
+    final Class<? extends Analyzer> clazz = Class.forName(impl.className)
+        .asSubclass(Analyzer.class);
+    Constructor<? extends Analyzer> ctor = clazz.getConstructor(collatorClazz);
+    return ctor.newInstance(collator);
+  }
+  
+  @Override
+  public int doLogic() throws Exception {
+    try {
+      Locale locale = getRunData().getLocale();
+      if (locale == null) throw new RuntimeException(
+          "Locale must be set with the NewLocale task!");
+      Analyzer analyzer = createAnalyzer(locale, impl);
+      getRunData().setAnalyzer(analyzer);
+      System.out.println("Changed Analyzer to: "
+          + analyzer.getClass().getName() + "(" + locale + ")");
+    } catch (Exception e) {
+      throw new RuntimeException("Error creating Analyzer: impl=" + impl, e);
+    }
+    return 1;
+  }
+  
+  @Override
+  public void setParams(String params) {
+    super.setParams(params);
+    
+    StringTokenizer st = new StringTokenizer(params, ",");
+    while (st.hasMoreTokens()) {
+      String param = st.nextToken();
+      StringTokenizer expr = new StringTokenizer(param, ":");
+      String key = expr.nextToken();
+      String value = expr.nextToken();
+      // for now we only support the "impl" parameter.
+      // TODO: add strength, decomposition, etc
+      if (key.equals("impl")) {
+        if (value.equalsIgnoreCase("icu"))
+          impl = Implementation.ICU;
+        else if (value.equalsIgnoreCase("jdk"))
+          impl = Implementation.JDK;
+        else
+          throw new RuntimeException("Unknown parameter " + param);
+      } else {
+        throw new RuntimeException("Unknown parameter " + param);
+      }
+    }
+  }
+
+  @Override
+  public boolean supportsParams() {
+    return true;
+  }
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewLocaleTask.java
@ -0,0 +1,89 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Locale;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+
+/**
+ * Set a {@link java.util.Locale} for use in benchmarking.
+ * <p>
+ * Locales can be specified in the following ways:
+ * <ul>
+ *  <li><code>de</code>: Language "de"
+ *  <li><code>en,US</code>: Language "en", country "US"
+ *  <li><code>no,NO,NY</code>: Language "no", country "NO", variant "NY" 
+ *  <li><code>ROOT</code>: The root (language-agnostic) Locale
+ *  <li>&lt;empty string&gt;: Erase the Locale (null)
+ * </ul>
+ * </p>
+ */
+public class NewLocaleTask extends PerfTask {
+  private String language;
+  private String country;
+  private String variant;
+  
+  /**
+   * Create a new {@link java.util.Locale} and set it it in the getRunData() for
+   * use by all future tasks.
+   */
+  public NewLocaleTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  static Locale createLocale(String language, String country, String variant) {
+    if (language == null || language.length() == 0) 
+      return null;
+    
+    String lang = language;
+    if (lang.equalsIgnoreCase("ROOT"))
+      lang = ""; // empty language is the root locale in the JDK
+      
+    return new Locale(lang, country, variant);
+  }
+  
+  @Override
+  public int doLogic() throws Exception {
+    Locale locale = createLocale(language, country, variant);
+    getRunData().setLocale(locale);
+    System.out.println("Changed Locale to: " + 
+        (locale == null ? "null" : 
+        (locale.getDisplayName().length() == 0) ? "root locale" : locale));
+    return 1;
+  }
+  
+  @Override
+  public void setParams(String params) {
+    super.setParams(params);
+    language = country = variant = "";
+    StringTokenizer st = new StringTokenizer(params, ",");
+    if (st.hasMoreTokens())
+      language = st.nextToken();
+    if (st.hasMoreTokens())
+      country = st.nextToken();
+    if (st.hasMoreTokens())
+      variant = st.nextToken();
+  }
+
+  @Override
+  public boolean supportsParams() {
+    return true;
+  }
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
@ -67,6 +67,8 @@ public class ReadTokensTask extends PerfTask {
    Analyzer analyzer = getRunData().getAnalyzer();
    int tokenCount = 0;
    for(final Fieldable field : fields) {
+      if (!field.isTokenized()) continue;
+      
      final TokenStream stream;
      final TokenStream streamValue = field.tokenStreamValue();

--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
@ -21,15 +21,23 @@ import java.io.StringReader;
 import java.io.File;
 import java.io.FileReader;
 import java.io.BufferedReader;
+import java.text.Collator;
 import java.util.List;
 import java.util.Iterator;
+import java.util.Locale;

+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
 import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
+import org.apache.lucene.collation.CollationKeyAnalyzer;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.index.SerialMergeScheduler;
@ -464,10 +472,14 @@ public class TestPerfTasksLogic extends LuceneTestCase {
    TermDocs termDocs = reader.termDocs();
    int totalTokenCount2 = 0;
    while(terms.next()) {
+      Term term = terms.term();
+      /* not-tokenized, but indexed field */
+      if (term != null && term.field() != DocMaker.ID_FIELD) { 
        termDocs.seek(terms.term());
-      while(termDocs.next())
+        while (termDocs.next())
          totalTokenCount2 += termDocs.freq();
      }
+    }
    reader.close();

    // Make sure they are the same
@ -850,6 +862,119 @@ public class TestPerfTasksLogic extends LuceneTestCase {
    };
  }

+  /**
+   * Test that we can change the Locale in the runData,
+   * that it is parsed as we expect.
+   */
+  public void testLocale() throws Exception {
+    // empty Locale: clear it (null)
+    Benchmark benchmark = execBenchmark(getLocaleConfig(""));
+    assertNull(benchmark.getRunData().getLocale());
+
+    // ROOT locale
+    benchmark = execBenchmark(getLocaleConfig("ROOT"));
+    assertEquals(new Locale(""), benchmark.getRunData().getLocale());
+    
+    // specify just a language 
+    benchmark = execBenchmark(getLocaleConfig("de"));
+    assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
+    
+    // specify language + country
+    benchmark = execBenchmark(getLocaleConfig("en,US"));
+    assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
+    
+    // specify language + country + variant
+    benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
+    assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
+  }
+   
+  private static String[] getLocaleConfig(String localeParam) {
+    String algLines[] = {
+        "# ----- properties ",
+        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
+        "docs.file=" + getReuters20LinesFile(),
+        "content.source.log.step=3",
+        "content.source.forever=false",
+        "directory=RAMDirectory",
+        "# ----- alg ",
+        "{ \"Rounds\"",
+        "  ResetSystemErase",
+        "  NewLocale(" + localeParam + ")",
+        "  CreateIndex",
+        "  { \"AddDocs\"  AddDoc > : * ",
+        "  NewRound",
+        "} : 1",
+    };
+    return algLines;
+  }
+  
+  /**
+   * Test that we can create CollationAnalyzers.
+   */
+  public void testCollator() throws Exception {
+    // ROOT locale
+    Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
+    CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator
+        .getInstance(new Locale("")));
+    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
+    
+    // specify just a language
+    benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
+    expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
+    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
+    
+    // specify language + country
+    benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
+    expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en",
+        "US")));
+    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
+    
+    // specify language + country + variant
+    benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
+    expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no",
+        "NO", "NY")));
+    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
+  }
+  
+  private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
+      throws Exception {
+    TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
+    TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
+    ts1.reset();
+    ts2.reset();
+    TermAttribute termAtt1 = ts1.addAttribute(TermAttribute.class);
+    TermAttribute termAtt2 = ts2.addAttribute(TermAttribute.class);
+    assertTrue(ts1.incrementToken());
+    assertTrue(ts2.incrementToken());
+    assertEquals(termAtt1.term(), termAtt2.term());
+    assertFalse(ts1.incrementToken());
+    assertFalse(ts2.incrementToken());
+    ts1.close();
+    ts2.close();
+  }
+  
+  private static String[] getCollatorConfig(String localeParam, 
+      String collationParam) {
+    String algLines[] = {
+        "# ----- properties ",
+        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
+        "docs.file=" + getReuters20LinesFile(),
+        "content.source.log.step=3",
+        "content.source.forever=false",
+        "directory=RAMDirectory",
+        "# ----- alg ",
+        "{ \"Rounds\"",
+        "  ResetSystemErase",
+        "  NewLocale(" + localeParam + ")",
+        "  NewCollationAnalyzer(" + collationParam + ")",
+        "  CreateIndex",
+        "  { \"AddDocs\"  AddDoc > : * ",
+        "  NewRound",
+        "} : 1",
+    };
+    return algLines;
+  }
+  
  private static String getReuters20LinesFile() {
    return System.getProperty("lucene.common.dir").replace('\\','/') +
      "/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";