LUCENE-1730: Fix TrecContentSource to use ISO-8859-1 when reading the TREC files, unless a different encoding is specified. Additionally, ContentSource now supports a content.source.encoding parameter in the configuration file.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@791528 13f79535-47bb-0310-9956-ffa450edef68
2009-07-06 15:56:39 +00:00 · 2009-07-06 15:56:39 +00:00 · f780f77366
parent 705f099238
commit f780f77366
5 changed files with 27 additions and 8 deletions
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety

 $Id:$

+7/6/2009
+  LUCENE-1730: Fix TrecContentSource to use ISO-8859-1 when reading the TREC files, 
+  unless a different encoding is specified. Additionally, ContentSource now supports 
+  a content.source.encoding parameter in the configuration file. 
+  (Shai Erera via Mark Miller)
+
 6/26/2009
  LUCENE-1716: Added the following support: 
  doc.tokenized.norms: specifies whether to store norms
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java
@ -34,9 +34,9 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
 /**
 * Represents content from a specified source, such as TREC, Reuters etc. A
 * {@link ContentSource} is responsible for creating {@link DocData} objects for
- * its documents to be consumed by {@link ToDeleteDocMaker}. It also keeps track of
- * various statistics, such as how many documents were generated, size in bytes
- * etc.
+ * its documents to be consumed by {@link ToDeleteDocMaker}. It also keeps track
+ * of various statistics, such as how many documents were generated, size in
+ * bytes etc.
 * <p>
 * Supports the following configuration parameters:
 * <ul>
@ -44,6 +44,9 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
 * forever (<b>default=true</b>).
 * <li><b>content.source.verbose</b> - specifies whether messages should be
 * output by the content source (<b>default=false</b>).
+ * <li><b>content.source.encoding</b> - specifies which encoding to use when
+ * reading the files of that content source. Certain implementations may define
+ * a default value if this parameter is not specified. (<b>default=null</b>).
 * <li><b>content.source.log.step</b> - specifies for how many documents a
 * message should be logged. If set to 0 it means no logging should occur.
 * <b>NOTE:</b> if verbose is set to false, logging should not occur even if
@ -71,6 +74,7 @@ public abstract class ContentSource {
  protected boolean forever;
  protected int logStep;
  protected boolean verbose;
+  protected String encoding;
  
  private CompressorStreamFactory csFactory = new CompressorStreamFactory();

@ -196,6 +200,7 @@ public abstract class ContentSource {
    forever = config.get("content.source.forever", true);
    logStep = config.get("content.source.log.step", 0);
    verbose = config.get("content.source.verbose", false);
+    encoding = config.get("content.source.encoding", null);
  }

 }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
@ -39,6 +39,7 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
 * Config properties:
 * <ul>
 * <li>docs.file=&lt;path to the file&gt;
+ * <li>content.source.encoding - default to UTF-8.
 * </ul>
 */
 public class LineDocSource extends ContentSource {
@ -54,7 +55,7 @@ public class LineDocSource extends ContentSource {
        reader.close();
      }
      InputStream is = getInputStream(file);
-      reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE);
+      reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
@ -111,6 +112,9 @@ public class LineDocSource extends ContentSource {
      throw new IllegalArgumentException("docs.file must be set");
    }
    file = new File(fileName).getAbsoluteFile();
+    if (encoding == null) {
+      encoding = "UTF-8";
+    }
  }

 }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
@ -47,10 +47,11 @@ import org.apache.lucene.benchmark.byTask.utils.StringBufferReader;
 * (<b>default=trec</b>).
 * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
 * parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
+ * <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
 * </ul>
 */
 public class TrecContentSource extends ContentSource {
-  // TODO (3.0): change StringBuffer to StringBuffer
+  // TODO (3.0): change StringBuffer to StringBuilder

  private static final class DateFormatInfo {
    DateFormat[] dfs;
@ -181,8 +182,8 @@ public class TrecContentSource extends ContentSource {
        System.out.println("opening: " + f + " length: " + f.length());
      }
      try {
-        GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), 1 << 16);
-        reader = new BufferedReader(new InputStreamReader(zis), 1 << 16);
+        GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), BUFFER_SIZE);
+        reader = new BufferedReader(new InputStreamReader(zis, encoding), BUFFER_SIZE);
        return;
      } catch (Exception e) {
        retries++;
@ -334,6 +335,9 @@ public class TrecContentSource extends ContentSource {
      // Should not get here. Throw runtime exception.
      throw new RuntimeException(e);
    }
+    if (encoding == null) {
+      encoding = "ISO-8859-1";
+    }
  }

 }
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
@ -40,7 +40,7 @@ public class TrecContentSourceTest extends TestCase {
      this.forever = forever;
    }
    
-    protected void openNextFile() throws NoMoreDataException, IOException {
+    void openNextFile() throws NoMoreDataException, IOException {
      if (reader != null) {
        if (!forever) {
          throw new NoMoreDataException();