LUCENE-2343: add support for benchmarking collectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@927178 13f79535-47bb-0310-9956-ffa450edef68
2010-03-24 20:49:44 +00:00 · 2010-03-24 20:49:44 +00:00 · eb6e13fe9e
parent 03216a150e
commit eb6e13fe9e
6 changed files with 463 additions and 122 deletions
--- a/lucene/contrib/benchmark/CHANGES.txt
+++ b/lucene/contrib/benchmark/CHANGES.txt
@ -2,7 +2,10 @@ Lucene Benchmark Contrib Change Log

 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.

-2/21/2020
+3/24/2010
+  LUCENE-2343: Added support for benchmarking collectors. (Grant Ingersoll, Shai Erera)
+
+2/21/2010
  LUCENE-2254: Add support to the quality package for running
  experiments with any combination of Title, Description, and Narrative.
  (Robert Muir)
--- a/lucene/contrib/benchmark/conf/collector-small.alg
+++ b/lucene/contrib/benchmark/conf/collector-small.alg
@ -0,0 +1,91 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# Multi-valued params are iterated by NewRound, are added to reports, and their value starts with the report column name.
+
+# collector.class can be:
+#    Fully qualified class name of a Collector with an empty (no-argument) constructor
+#    topScoreDocOrdered - Creates a TopScoreDocCollector that requires in-order docs
+#    topScoreDocUnordered - Like above, but allows out-of-order docs
+collector.class=coll:topScoreDocOrdered:topScoreDocUnordered:topScoreDocOrdered:topScoreDocUnordered
+
+analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer
+directory=FSDirectory
+#directory=RamDirectory
+
+doc.stored=true
+doc.tokenized=true
+doc.term.vector=false
+log.step=100000
+
+search.num.hits=100000
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.LongToEnglishContentSource
+
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.LongToEnglishQueryMaker
+
+# tasks at this depth or less print a message when they start
+task.max.depth.log=2
+
+log.queries=true
+# -------------------------------------------------------------------------------------
+
+{ "Rounds"
+
+    ResetSystemErase
+
+    { "Populate"
+        CreateIndex
+        { "MAddDocs" AddDoc } : 200000
+        Optimize
+        CloseIndex
+    }
+
+    OpenReader
+    { "topDocs" SearchWithCollector > : 10
+    CloseReader
+
+#    OpenReader
+# uses an array of size search.num.hits, but can also take a parameter, e.g. (100)
+#    { "psc" SearchWithPostSortCollector > : 10
+#    { "psc100" SearchWithPostSortCollector(100) > : 10
+#    { "psc1000" SearchWithPostSortCollector(1000) > : 10
+#    { "psc10000" SearchWithPostSortCollector(10000) > : 10
+#    { "psc50000" SearchWithPostSortCollector(50000) > : 10
+#    CloseReader
+
+    RepSumByPref topDocs
+#    RepSumByPref psc
+#    RepSumByPref psc100
+#    RepSumByPref psc1000
+#    RepSumByPref psc10000
+#    RepSumByPref psc50000
+
+    NewRound
+
+} : 4
+
+#RepSumByNameRound
+#RepSumByName
+#RepSumByPrefRound topDocs
+#RepSumByPrefRound psc
+#RepSumByPrefRound psc100
+#RepSumByPrefRound psc1000
+#RepSumByPrefRound psc10000
+#RepSumByPrefRound psc50000
+
--- a/lucene/contrib/benchmark/conf/collector.alg
+++ b/lucene/contrib/benchmark/conf/collector.alg
@ -0,0 +1,91 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# Multi-valued params are iterated by NewRound, are added to reports, and their value starts with the report column name.
+
+# collector.class can be:
+#    Fully qualified class name of a Collector with an empty (no-argument) constructor
+#    topScoreDocOrdered - Creates a TopScoreDocCollector that requires in-order docs
+#    topScoreDocUnordered - Like above, but allows out-of-order docs
+collector.class=coll:topScoreDocOrdered:topScoreDocUnordered:topScoreDocOrdered:topScoreDocUnordered
+
+analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer
+directory=FSDirectory
+#directory=RamDirectory
+
+doc.stored=true
+doc.tokenized=true
+doc.term.vector=false
+log.step=100000
+
+search.num.hits=1000000
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.LongToEnglishContentSource
+
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.LongToEnglishQueryMaker
+
+# tasks at this depth or less print a message when they start
+task.max.depth.log=2
+
+log.queries=true
+# -------------------------------------------------------------------------------------
+
+{ "Rounds"
+
+    ResetSystemErase
+
+    { "Populate"
+        CreateIndex
+        { "MAddDocs" AddDoc } : 2000000
+        Optimize
+        CloseIndex
+    }
+
+    OpenReader
+    { "topDocs" SearchWithCollector > : 10
+    CloseReader
+
+#    OpenReader
+# uses an array of size search.num.hits, but can also take a parameter, e.g. (100)
+#    { "psc" SearchWithPostSortCollector > : 10
+#    { "psc100" SearchWithPostSortCollector(100) > : 10
+#    { "psc1000" SearchWithPostSortCollector(1000) > : 10
+#    { "psc10000" SearchWithPostSortCollector(10000) > : 10
+#    { "psc50000" SearchWithPostSortCollector(50000) > : 10
+#    CloseReader
+
+    RepSumByPref topDocs
+#    RepSumByPref psc
+#    RepSumByPref psc100
+#    RepSumByPref psc1000
+#    RepSumByPref psc10000
+#    RepSumByPref psc50000
+
+    NewRound
+
+} : 4
+
+#RepSumByNameRound
+#RepSumByName
+#RepSumByPrefRound topDocs
+#RepSumByPrefRound psc
+#RepSumByPrefRound psc100
+#RepSumByPrefRound psc1000
+#RepSumByPrefRound psc10000
+#RepSumByPrefRound psc50000
+
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
@ -30,10 +30,12 @@ import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.TopFieldCollector;
 import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopScoreDocCollector;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
@ -105,9 +107,10 @@ public abstract class ReadTask extends PerfTask {
      res++;
      Query q = queryMaker.makeQuery();
      Sort sort = getSort();
-      TopDocs hits;
+      TopDocs hits = null;
      final int numHits = numHits();
      if (numHits > 0) {
+        if (withCollector() == false) {
          if (sort != null) {
            Weight w = q.weight(searcher);
            TopFieldCollector collector = TopFieldCollector.create(sort, numHits,
@ -119,9 +122,14 @@ public abstract class ReadTask extends PerfTask {
          } else {
            hits = searcher.search(q, numHits);
          }
+        } else {
+          Collector collector = createCollector();
+          searcher.search(q, null, collector);
+          //hits = collector.topDocs();
+        }

        final String printHitsField = getRunData().getConfig().get("print.hits.field", null);
-        if (printHitsField != null && printHitsField.length() > 0) {
+        if (hits != null && printHitsField != null && printHitsField.length() > 0) {
          if (q instanceof MultiTermQuery) {
            System.out.println("MultiTermQuery term count = " + ((MultiTermQuery) q).getTotalNumberOfTerms());
          }
@ -177,6 +185,9 @@ public abstract class ReadTask extends PerfTask {
    return res;
  }

+  protected Collector createCollector() throws Exception {
+    return TopScoreDocCollector.create(numHits(), true);
+  }


  protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
@ -193,6 +204,10 @@ public abstract class ReadTask extends PerfTask {
   */
  public abstract boolean withSearch();

+  public boolean withCollector(){
+    return false;
+  }
+  

  /**
   * Return true if warming should be performed.
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithCollectorTask.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithCollectorTask.java
@ -0,0 +1,95 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.TopScoreDocCollector;
+
+import java.io.IOException;
+
+/**
+ * Runs a search with a custom {@link Collector}, selected via the <code>collector.class</code> config property.
+ */
+public class SearchWithCollectorTask extends SearchTask {
+
+  protected String clnName;
+
+  public SearchWithCollectorTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    // read the configured collector name; an empty value falls back to the default collector
+    PerfRunData runData = getRunData();
+    Config config = runData.getConfig();
+    clnName = config.get("collector.class", "");
+  }
+
+  
+
+  @Override
+  public boolean withCollector() {
+    return true;
+  }
+
+  @Override
+  protected Collector createCollector() throws Exception {
+    Collector collector = null;
+    if (clnName.equalsIgnoreCase("topScoreDocOrdered") == true) {
+      collector = TopScoreDocCollector.create(numHits(), true);
+    } else if (clnName.equalsIgnoreCase("topScoreDocUnOrdered") == true) {
+      collector = TopScoreDocCollector.create(numHits(), false);
+    } else if (clnName.length() > 0){
+      collector = Class.forName(clnName).asSubclass(Collector.class).newInstance();
+
+    } else {
+      collector = super.createCollector();
+    }
+    return collector;
+  }
+
+  @Override
+  public QueryMaker getQueryMaker() {
+    return getRunData().getQueryMaker(this);
+  }
+
+  @Override
+  public boolean withRetrieve() {
+    return false;
+  }
+
+  @Override
+  public boolean withSearch() {
+    return true;
+  }
+
+  @Override
+  public boolean withTraverse() {
+    return false;
+  }
+
+  @Override
+  public boolean withWarm() {
+    return false;
+  }
+
+}
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
@ -22,19 +22,19 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
-import java.util.List;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Properties;
 import java.util.StringTokenizer;

 /**
 * Perf run configuration properties.
- * <p>
+ * <p/>
 * Numeric property containing ":", e.g. "10:100:5" is interpreted
 * as array of numeric values. It is extracted once, on first use, and
 * maintain a round number to return the appropriate value.
- * <p>
+ * <p/>
 * The config property "work.dir" tells where is the root of
 * docs data dirs and indexes dirs. It is set to either of: <ul>
 * <li>value supplied for it in the alg file;</li>
@ -54,6 +54,7 @@ public class Config {

  /**
   * Read both algorithm and config properties.
+   *
   * @param algReader from where to read algorithm and config properties.
   * @throws IOException
   */
@ -121,17 +122,38 @@ public class Config {

  /**
   * Return a string property.
+   *
   * @param name name of property.
   * @param dflt default value.
   * @return a string property.
   */
  public String get(String name, String dflt) {
-    return props.getProperty(name,dflt);
+    String vals[] = (String[]) valByRound.get(name);
+    if (vals != null) {
+      return vals[roundNumber % vals.length];
+    }
+    // done if not by round
+    String sval = props.getProperty(name, dflt);
+    if (sval == null) {
+      return null;
+    }
+    if (sval.indexOf(":") < 0) {
+      return sval;
+    }
+    // first time this prop is extracted by round
+    int k = sval.indexOf(":");
+    String colName = sval.substring(0, k);
+    sval = sval.substring(k + 1);
+    colForValByRound.put(name, colName);
+    vals = propToStringArray(sval);
+    valByRound.put(name, vals);
+    return vals[roundNumber % vals.length];
  }

  /**
   * Set a property.
   * Note: once a multiple values property is set, it can no longer be modified.
+   *
   * @param name  name of property.
   * @param value either single or multiple property value (multiple values are separated by ":")
   * @throws Exception
@ -148,6 +170,7 @@ public class Config {
   * If the property contain ":", e.g. "10:100:5", it is interpreted
   * as array of ints. It is extracted once, on first call
   * to get() it, and a by-round-value is returned.
+   *
   * @param name name of property
   * @param dflt default value
   * @return a int property.
@ -178,6 +201,7 @@ public class Config {
   * If the property contain ":", e.g. "10:100:5", it is interpreted
   * as array of doubles. It is extracted once, on first call
   * to get() it, and a by-round-value is returned.
+   *
   * @param name name of property
   * @param dflt default value
   * @return a double property.
@ -208,6 +232,7 @@ public class Config {
   * If the property contain ":", e.g. "true.true.false", it is interpreted
   * as array of booleans. It is extracted once, on first call
   * to get() it, and a by-round-value is returned.
+   *
   * @param name name of property
   * @param dflt default value
   * @return a int property.
@ -235,6 +260,7 @@ public class Config {

  /**
   * Increment the round number, for config values that are extracted by round number.
+   *
   * @return the new round number.
   */
  public int newRound() {
@ -257,8 +283,12 @@ public class Config {
          int n1 = (roundNumber - 1) % ad.length;
          int n2 = roundNumber % ad.length;
          sb.append("  ").append(name).append(":").append(ad[n1]).append("-->").append(ad[n2]);
-        }
-        else {
+        } else if (a instanceof String[]) {
+          String ad[] = (String[]) a;
+          int n1 = (roundNumber - 1) % ad.length;
+          int n2 = roundNumber % ad.length;
+          sb.append("  ").append(name).append(":").append(ad[n1]).append("-->").append(ad[n2]);
+        } else {
          boolean ab[] = (boolean[]) a;
          int n1 = (roundNumber - 1) % ab.length;
          int n2 = roundNumber % ab.length;
@ -274,6 +304,20 @@ public class Config {
    return roundNumber;
  }

+  private String[] propToStringArray(String s) {
+    if (s.indexOf(":") < 0) {
+      return new String[]{s};
+    }
+
+    ArrayList<String> a = new ArrayList<String>();
+    StringTokenizer st = new StringTokenizer(s, ":");
+    while (st.hasMoreTokens()) {
+      String t = st.nextToken();
+      a.add(t);
+    }
+    return (String[]) a.toArray(new String[a.size()]);
+  }
+
  // extract properties to array, e.g. for "10:100:5" return int[]{10,100,5}. 
  private int[] propToIntArray(String s) {
    if (s.indexOf(":") < 0) {
@ -367,13 +411,15 @@ public class Config {
          int ai[] = (int[]) a;
          int n = roundNum % ai.length;
          sb.append(Format.format(ai[n], template));
-        }
-        else if (a instanceof double[]) {
+        } else if (a instanceof double[]) {
          double ad[] = (double[]) a;
          int n = roundNum % ad.length;
          sb.append(Format.format(2, ad[n], template));
-        }
-        else {
+        } else if (a instanceof String[]) {
+          String ad[] = (String[]) a;
+          int n = roundNum % ad.length;
+          sb.append(ad[n]);
+        } else {
          boolean ab[] = (boolean[]) a;
          int n = roundNum % ab.length;
          sb.append(Format.formatPaddLeft("" + ab[n], template));