LUCENE-2343: add support for benchmarking collectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@927178 13f79535-47bb-0310-9956-ffa450edef68
2010-03-24 20:49:44 +00:00 · 2010-03-24 20:49:44 +00:00 · eb6e13fe9e
parent 03216a150e
commit eb6e13fe9e
6 changed files with 463 additions and 122 deletions
--- a/lucene/contrib/benchmark/CHANGES.txt
+++ b/lucene/contrib/benchmark/CHANGES.txt
@ -2,7 +2,10 @@ Lucene Benchmark Contrib Change Log
 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
-2/21/2020
+3/24/2010
  LUCENE-2343: Added support for benchmarking collectors. (Grant Ingersoll, Shai Erera)
 2/21/2010
  LUCENE-2254: Add support to the quality package for running
  experiments with any combination of Title, Description, and Narrative.
  (Robert Muir)
--- a/lucene/contrib/benchmark/conf/collector-small.alg
+++ b/lucene/contrib/benchmark/conf/collector-small.alg
@ -0,0 +1,91 @@
 #/**
 # * Licensed to the Apache Software Foundation (ASF) under one or more
 # * contributor license agreements.  See the NOTICE file distributed with
 # * this work for additional information regarding copyright ownership.
 # * The ASF licenses this file to You under the Apache License, Version 2.0
 # * (the "License"); you may not use this file except in compliance with
 # * the License.  You may obtain a copy of the License at
 # *
 # *     http://www.apache.org/licenses/LICENSE-2.0
 # *
 # * Unless required by applicable law or agreed to in writing, software
 # * distributed under the License is distributed on an "AS IS" BASIS,
 # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # * See the License for the specific language governing permissions and
 # * limitations under the License.
 # */
 # -------------------------------------------------------------------------------------
 # multi val params are iterated by NewRound's, added to reports, start with column name.
 # collector.class can be:
 #    Fully Qualified Class Name of a Collector with an empty constructor
 #    topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
 #    topScoreDocUnordered - Like above, but allows out of order
 collector.class=coll:topScoreDocOrdered:topScoreDocUnordered:topScoreDocOrdered:topScoreDocUnordered
 analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer
 directory=FSDirectory
 #directory=RamDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
 log.step=100000
 search.num.hits=100000
 content.source=org.apache.lucene.benchmark.byTask.feeds.LongToEnglishContentSource
 query.maker=org.apache.lucene.benchmark.byTask.feeds.LongToEnglishQueryMaker
 # task at this depth or less would print when they start
 task.max.depth.log=2
 log.queries=true
 # -------------------------------------------------------------------------------------
 { "Rounds"
    ResetSystemErase
    { "Populate"
        CreateIndex
        { "MAddDocs" AddDoc } : 200000
        Optimize
        CloseIndex
    }
    OpenReader
    { "topDocs" SearchWithCollector > : 10
    CloseReader
 #    OpenReader
 #uses an array of search.num.hits size, but can also take in a parameter
 #    { "psc" SearchWithPostSortCollector > : 10
 #    { "psc100" SearchWithPostSortCollector(100) > : 10
 #    { "psc1000" SearchWithPostSortCollector(1000) > : 10
 #    { "psc10000" SearchWithPostSortCollector(10000) > : 10
 #    { "psc50000" SearchWithPostSortCollector(50000) > : 10
 #    CloseReader
    RepSumByPref topDocs
 #    RepSumByPref psc
 #    RepSumByPref psc100
 #    RepSumByPref psc1000
 #    RepSumByPref psc10000
 #    RepSumByPref psc50000
    NewRound
 } : 4
 #RepSumByNameRound
 #RepSumByName
 #RepSumByPrefRound topDocs
 #RepSumByPrefRound psc
 #RepSumByPrefRound psc100
 #RepSumByPrefRound psc1000
 #RepSumByPrefRound psc10000
 #RepSumByPrefRound psc50000
--- a/lucene/contrib/benchmark/conf/collector.alg
+++ b/lucene/contrib/benchmark/conf/collector.alg
@ -0,0 +1,91 @@
 #/**
 # * Licensed to the Apache Software Foundation (ASF) under one or more
 # * contributor license agreements.  See the NOTICE file distributed with
 # * this work for additional information regarding copyright ownership.
 # * The ASF licenses this file to You under the Apache License, Version 2.0
 # * (the "License"); you may not use this file except in compliance with
 # * the License.  You may obtain a copy of the License at
 # *
 # *     http://www.apache.org/licenses/LICENSE-2.0
 # *
 # * Unless required by applicable law or agreed to in writing, software
 # * distributed under the License is distributed on an "AS IS" BASIS,
 # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # * See the License for the specific language governing permissions and
 # * limitations under the License.
 # */
 # -------------------------------------------------------------------------------------
 # multi val params are iterated by NewRound's, added to reports, start with column name.
 # collector.class can be:
 #    Fully Qualified Class Name of a Collector with an empty constructor
 #    topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
 #    topScoreDocUnordered - Like above, but allows out of order
 collector.class=coll:topScoreDocOrdered:topScoreDocUnordered:topScoreDocOrdered:topScoreDocUnordered
 analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer
 directory=FSDirectory
 #directory=RamDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
 log.step=100000
 search.num.hits=1000000
 content.source=org.apache.lucene.benchmark.byTask.feeds.LongToEnglishContentSource
 query.maker=org.apache.lucene.benchmark.byTask.feeds.LongToEnglishQueryMaker
 # task at this depth or less would print when they start
 task.max.depth.log=2
 log.queries=true
 # -------------------------------------------------------------------------------------
 { "Rounds"
    ResetSystemErase
    { "Populate"
        CreateIndex
        { "MAddDocs" AddDoc } : 2000000
        Optimize
        CloseIndex
    }
    OpenReader
    { "topDocs" SearchWithCollector > : 10
    CloseReader
 #    OpenReader
 #uses an array of search.num.hits size, but can also take in a parameter
 #    { "psc" SearchWithPostSortCollector > : 10
 #    { "psc100" SearchWithPostSortCollector(100) > : 10
 #    { "psc1000" SearchWithPostSortCollector(1000) > : 10
 #    { "psc10000" SearchWithPostSortCollector(10000) > : 10
 #    { "psc50000" SearchWithPostSortCollector(50000) > : 10
 #    CloseReader
    RepSumByPref topDocs
 #    RepSumByPref psc
 #    RepSumByPref psc100
 #    RepSumByPref psc1000
 #    RepSumByPref psc10000
 #    RepSumByPref psc50000
    NewRound
 } : 4
 #RepSumByNameRound
 #RepSumByName
 #RepSumByPrefRound topDocs
 #RepSumByPrefRound psc
 #RepSumByPrefRound psc100
 #RepSumByPrefRound psc1000
 #RepSumByPrefRound psc10000
 #RepSumByPrefRound psc50000
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
@ -30,10 +30,12 @@ import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.TopFieldCollector;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopScoreDocCollector;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
@ -105,9 +107,10 @@ public abstract class ReadTask extends PerfTask {
      res++;
      Query q = queryMaker.makeQuery();
      Sort sort = getSort();
-      TopDocs hits;
+      TopDocs hits = null;
      final int numHits = numHits();
      if (numHits > 0) {
        if (withCollector() == false) {
          if (sort != null) {
            Weight w = q.weight(searcher);
            TopFieldCollector collector = TopFieldCollector.create(sort, numHits,
@ -119,9 +122,14 @@ public abstract class ReadTask extends PerfTask {
          } else {
            hits = searcher.search(q, numHits);
          }
        } else {
          Collector collector = createCollector();
          searcher.search(q, null, collector);
          //hits = collector.topDocs();
        }
        final String printHitsField = getRunData().getConfig().get("print.hits.field", null);
-        if (printHitsField != null && printHitsField.length() > 0) {
+        if (hits != null && printHitsField != null && printHitsField.length() > 0) {
          if (q instanceof MultiTermQuery) {
            System.out.println("MultiTermQuery term count = " + ((MultiTermQuery) q).getTotalNumberOfTerms());
          }
@ -177,6 +185,9 @@ public abstract class ReadTask extends PerfTask {
    return res;
  }
  /**
   * Creates the collector that the search is run with when
   * {@link #withCollector()} returns true.
   * <p/>
   * This default builds a {@link TopScoreDocCollector} sized by
   * {@link #numHits()} that requires docs to be scored in order.
   *
   * @return the collector to pass to the searcher.
   * @throws Exception if the collector cannot be created.
   */
  protected Collector createCollector() throws Exception {
    final int maxHits = numHits();
    final boolean docsScoredInOrder = true;
    return TopScoreDocCollector.create(maxHits, docsScoredInOrder);
  }
  protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
@ -193,6 +204,10 @@ public abstract class ReadTask extends PerfTask {
   */
  public abstract boolean withSearch();
  /**
   * Return true if the search should be run with a collector obtained from
   * createCollector() rather than through the regular top-docs search.
   * This base implementation always answers false.
   */
  public boolean withCollector() {
    return false;
  }
  /**
   * Return true if warming should be performed.
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithCollectorTask.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithCollectorTask.java
@ -0,0 +1,95 @@
 package org.apache.lucene.benchmark.byTask.tasks;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.TopScoreDocCollector;
 import java.io.IOException;
 /**
 * Search task that runs each query with a configurable {@link Collector}.
 * <p/>
 * The collector is chosen by the "collector.class" config property, which may be:
 * <ul>
 * <li>"topScoreDocOrdered" - a TopScoreDocCollector that requires in order docs;</li>
 * <li>"topScoreDocUnordered" - a TopScoreDocCollector that allows out of order docs;</li>
 * <li>any other non-empty value - taken as the class name of a Collector
 *     and instantiated through its empty constructor;</li>
 * <li>empty or not set - the collector created by the super class.</li>
 * </ul>
 * This task only searches: it does not warm, traverse or retrieve.
 */
 public class SearchWithCollectorTask extends SearchTask {
  /** Value of the "collector.class" property, as read by {@link #setup()}. */
  protected String clnName;
  public SearchWithCollectorTask(PerfRunData runData) {
    super(runData);
  }
  /**
   * Runs the super class setup and then reads which collector to use;
   * an unset property is read as the empty string.
   */
  @Override
  public void setup() throws Exception {
    super.setup();
    clnName = getRunData().getConfig().get("collector.class", "");
  }
  /** Always true: this task searches with a collector. */
  @Override
  public boolean withCollector() {
    return true;
  }
  /**
   * Creates the collector named by the "collector.class" property.
   * The two built-in names are matched ignoring case.
   *
   * @return the collector to search with.
   * @throws Exception if the configured class cannot be loaded or instantiated.
   */
  @Override
  protected Collector createCollector() throws Exception {
    if (clnName.equalsIgnoreCase("topScoreDocOrdered")) {
      return TopScoreDocCollector.create(numHits(), true);
    }
    if (clnName.equalsIgnoreCase("topScoreDocUnOrdered")) {
      return TopScoreDocCollector.create(numHits(), false);
    }
    if (clnName.length() == 0) {
      // nothing configured: fall back to the super class collector
      return super.createCollector();
    }
    // anything else is taken as the class name of a Collector
    return Class.forName(clnName).asSubclass(Collector.class).newInstance();
  }
  @Override
  public QueryMaker getQueryMaker() {
    return getRunData().getQueryMaker(this);
  }
  @Override
  public boolean withRetrieve() {
    return false;
  }
  @Override
  public boolean withSearch() {
    return true;
  }
  @Override
  public boolean withTraverse() {
    return false;
  }
  @Override
  public boolean withWarm() {
    return false;
  }
 }
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
@ -22,19 +22,19 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Properties;
 import java.util.StringTokenizer;
 /**
 * Perf run configuration properties.
- * <p>
+ * <p/>
 * Numeric property containing ":", e.g. "10:100:5" is interpreted
 * as array of numeric values. It is extracted once, on first use, and
 * maintain a round number to return the appropriate value.
- * <p>
+ * <p/>
 * The config property "work.dir" tells where is the root of
 * docs data dirs and indexes dirs. It is set to either of: <ul>
 * <li>value supplied for it in the alg file;</li>
@ -54,6 +54,7 @@ public class Config {
  /**
   * Read both algorithm and config properties.
   *
   * @param algReader from where to read algorithm and config properties.
   * @throws IOException
   */
@ -121,17 +122,38 @@ public class Config {
  /**
   * Return a string property.
   * <p/>
   * If the property value contains ":", e.g. "coll:a:b", the text before the
   * first ":" is recorded as the column name of the property and the remaining
   * ":" separated values are kept as an array of strings. The array is extracted
   * once, on first call, and afterwards the value matching the current round
   * number is returned. The default value is treated the same way when the
   * property itself is not set.
   *
   * @param name name of property.
   * @param dflt default value.
   * @return a string property, or null if the property is not set and dflt is null.
   */
  public String get(String name, String dflt) {
-    return props.getProperty(name,dflt);
+    String vals[] = (String[]) valByRound.get(name);
    // already extracted as a by-round property: pick the value of the current round
    if (vals != null) {
      return vals[roundNumber % vals.length];
    }
    // done if not by round
    String sval = props.getProperty(name, dflt);
    if (sval == null) {
      return null;
    }
    // no ":" means a plain, single valued property
    if (sval.indexOf(":") < 0) {
      return sval;
    }
    // first time this prop is extracted by round
    int k = sval.indexOf(":");
    // text before the first ":" is the column name, the rest holds the values
    String colName = sval.substring(0, k);
    sval = sval.substring(k + 1);
    colForValByRound.put(name, colName);
    vals = propToStringArray(sval);
    // remember the values so that later calls take the by-round path above
    valByRound.put(name, vals);
    return vals[roundNumber % vals.length];
  }
  /**
   * Set a property.
   * Note: once a multiple values property is set, it can no longer be modified.
   *
   * @param name  name of property.
   * @param value either single or multiple property value (multiple values are separated by ":")
   * @throws Exception
@ -148,6 +170,7 @@ public class Config {
   * If the property contain ":", e.g. "10:100:5", it is interpreted
   * as array of ints. It is extracted once, on first call
   * to get() it, and a by-round-value is returned.
   *
   * @param name name of property
   * @param dflt default value
   * @return an int property.
@ -178,6 +201,7 @@ public class Config {
   * If the property contain ":", e.g. "10:100:5", it is interpreted
   * as array of doubles. It is extracted once, on first call
   * to get() it, and a by-round-value is returned.
   *
   * @param name name of property
   * @param dflt default value
   * @return a double property.
@ -208,6 +232,7 @@ public class Config {
   * If the property contains ":", e.g. "true:true:false", it is interpreted
   * as array of booleans. It is extracted once, on first call
   * to get() it, and a by-round-value is returned.
   *
   * @param name name of property
   * @param dflt default value
   * @return a boolean property.
@ -235,6 +260,7 @@ public class Config {
  /**
   * Increment the round number, for config values that are extracted by round number.
   *
   * @return the new round number.
   */
  public int newRound() {
@ -257,8 +283,12 @@ public class Config {
          int n1 = (roundNumber - 1) % ad.length;
          int n2 = roundNumber % ad.length;
          sb.append("  ").append(name).append(":").append(ad[n1]).append("-->").append(ad[n2]);
-        }
+        } else if (a instanceof String[]) {
-        else {
+          String ad[] = (String[]) a;
          int n1 = (roundNumber - 1) % ad.length;
          int n2 = roundNumber % ad.length;
          sb.append("  ").append(name).append(":").append(ad[n1]).append("-->").append(ad[n2]);
        } else {
          boolean ab[] = (boolean[]) a;
          int n1 = (roundNumber - 1) % ab.length;
          int n2 = roundNumber % ab.length;
@ -274,6 +304,20 @@ public class Config {
    return roundNumber;
  }
  // extract properties to array, e.g. for "a:b:c" return String[]{"a","b","c"}.
  // a value with no ":" is returned as is, as the only element;
  // empty tokens (e.g. the middle of "a::b") are skipped by the tokenizer.
  private String[] propToStringArray(String s) {
    if (!s.contains(":")) {
      return new String[]{s};
    }
    StringTokenizer tokens = new StringTokenizer(s, ":");
    String[] values = new String[tokens.countTokens()];
    for (int i = 0; i < values.length; i++) {
      values[i] = tokens.nextToken();
    }
    return values;
  }
  // extract properties to array, e.g. for "10:100:5" return int[]{10,100,5}. 
  private int[] propToIntArray(String s) {
    if (s.indexOf(":") < 0) {
@ -367,13 +411,15 @@ public class Config {
          int ai[] = (int[]) a;
          int n = roundNum % ai.length;
          sb.append(Format.format(ai[n], template));
-        }
+        } else if (a instanceof double[]) {
        else if (a instanceof double[]) {
          double ad[] = (double[]) a;
          int n = roundNum % ad.length;
          sb.append(Format.format(2, ad[n], template));
-        }
+        } else if (a instanceof String[]) {
-        else {
+          String ad[] = (String[]) a;
          int n = roundNum % ad.length;
          sb.append(ad[n]);
        } else {
          boolean ab[] = (boolean[]) a;
          int n = roundNum % ab.length;
          sb.append(Format.formatPaddLeft("" + ab[n], template));