From 8834e675ca50b60a7b9d6ed0f71ed183d0c5e5b0 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sat, 7 Jul 2012 13:30:52 +0000 Subject: [PATCH] LUCENE-4199: fix more charset/locale violations git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4199@1358564 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/benchmark/byTask/Benchmark.java | 3 ++- .../benchmark/byTask/feeds/DirContentSource.java | 6 ++++-- .../benchmark/byTask/feeds/EnwikiContentSource.java | 3 ++- .../benchmark/byTask/feeds/FileBasedQueryMaker.java | 6 ++++-- .../benchmark/byTask/feeds/ReutersContentSource.java | 6 ++++-- .../lucene/benchmark/byTask/tasks/CreateIndexTask.java | 3 ++- .../lucene/benchmark/byTask/tasks/TaskSequence.java | 3 ++- .../lucene/benchmark/byTask/utils/Algorithm.java | 3 ++- .../lucene/benchmark/quality/trec/QueryDriver.java | 7 ++++--- .../apache/lucene/benchmark/utils/ExtractReuters.java | 10 ++++++++-- .../lucene/index/TestBackwardsCompatibility.java | 4 ++-- 11 files changed, 36 insertions(+), 18 deletions(-) diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java index 7d342d0c93d..81ed4c2df0e 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java @@ -23,6 +23,7 @@ import java.io.Reader; import org.apache.lucene.benchmark.byTask.utils.Algorithm; import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.util.IOUtils; /** @@ -106,7 +107,7 @@ public class Benchmark { Benchmark benchmark = null; try { - benchmark = new Benchmark(new FileReader(algFile)); + benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, IOUtils.CHARSET_UTF_8)); } catch (Exception e) { e.printStackTrace(); System.exit(1); diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java index 4599f90dab5..e0dfa0bd81e 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java @@ -18,12 +18,14 @@ package org.apache.lucene.benchmark.byTask.feeds; */ import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.util.IOUtils; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; -import java.io.FileReader; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStreamReader; import java.text.DateFormat; import java.text.ParsePosition; import java.text.SimpleDateFormat; @@ -198,7 +200,7 @@ public class DirContentSource extends ContentSource { name = f.getCanonicalPath()+"_"+iteration; } - BufferedReader reader = new BufferedReader(new FileReader(f)); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8)); String line = null; //First line is the date, 3rd is the title, rest is body String dateStr = reader.readLine(); diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java index 541e201d416..7ca1fc56d00 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java @@ -25,6 +25,7 @@ import java.io.InputStreamReader; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import org.apache.lucene.benchmark.byTask.utils.Config; @@ -146,7 +147,7 @@ public class EnwikiContentSource extends ContentSource { case BODY: body = contents.toString(); //workaround that startswith doesn't have an ignore case option, get at least 20 chars. - String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); + String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT); if (startsWith.startsWith("#redirect")) { body = null; } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java index ac17182440a..79b25f9700a 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java @@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.Query; import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.*; @@ -59,13 +60,14 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake { File file = new File(fileName); Reader reader = null; + // note: we use a decoding reader, so if your queries are screwed up you know if (file.exists()) { - reader = new FileReader(file); + reader = IOUtils.getDecodingReader(file, IOUtils.CHARSET_UTF_8); } else { //see if we can find it as a resource InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName); if (asStream != null) { - reader = new InputStreamReader(asStream); + reader = IOUtils.getDecodingReader(asStream, IOUtils.CHARSET_UTF_8); } } if (reader != null) { diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java index 6b3bd09be68..fe7bb42620c 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java @@ -19,8 +19,9 @@ package org.apache.lucene.benchmark.byTask.feeds; import java.io.BufferedReader; import java.io.File; -import java.io.FileReader; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStreamReader; import java.text.DateFormat; import java.text.ParsePosition; import java.text.SimpleDateFormat; @@ -29,6 +30,7 @@ import java.util.Date; import java.util.Locale; import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.util.IOUtils; /** * A {@link ContentSource} reading from the Reuters collection. @@ -112,7 +114,7 @@ public class ReutersContentSource extends ContentSource { name = f.getCanonicalPath() + "_" + iteration; } - BufferedReader reader = new BufferedReader(new FileReader(f)); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8)); try { // First line is the date, 3rd is the title, rest is body String dateStr = reader.readLine(); diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java index 9f30059da9f..7f8c99418b4 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java @@ -40,6 +40,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; +import java.nio.charset.Charset; /** * Create an index.
@@ -182,7 +183,7 @@ public class CreateIndexTask extends PerfTask { iwc.setInfoStream(System.err); } else { File f = new File(infoStreamVal).getAbsoluteFile(); - iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f)))); + iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f)), false, Charset.defaultCharset().name())); } } IndexWriter writer = new IndexWriter(runData.getDirectory(), iwc); diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java index 068cc058b0b..4f3abec4d41 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java @@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.text.NumberFormat; import org.apache.lucene.benchmark.byTask.PerfRunData; @@ -487,7 +488,7 @@ public class TaskSequence extends PerfTask { if (rate>0) { seqName += "_" + rate + (perMin?"/min":"/sec"); } - if (parallel && seqName.toLowerCase().indexOf("par")<0) { + if (parallel && seqName.toLowerCase(Locale.ROOT).indexOf("par")<0) { seqName += "_Par"; } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java index 2c8b3a4ad77..78ff2d44d4c 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java @@ -22,6 +22,7 @@ import java.io.StringReader; import java.lang.reflect.Constructor; import java.util.ArrayList; import java.util.Arrays; +import java.util.Locale; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.tasks.PerfTask; @@ -159,7 +160,7 @@ public class Algorithm { } else { stok.nextToken(); if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString()); - String unit = stok.sval.toLowerCase(); + String unit = stok.sval.toLowerCase(Locale.ROOT); if ("min".equals(unit)) { ((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min } else if ("sec".equals(unit)) { diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java index de3047a850e..a1ee941a5b0 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.IOUtils; import java.io.BufferedReader; import java.io.File; @@ -51,7 +52,7 @@ public class QueryDriver { File topicsFile = new File(args[0]); File qrelsFile = new File(args[1]); - SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2]), "lucene"); + SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2], "UTF-8"), "lucene"); FSDirectory dir = FSDirectory.open(new File(args[3])); String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified. IndexReader reader = DirectoryReader.open(dir); @@ -64,10 +65,10 @@ public class QueryDriver { // use trec utilities to read trec topics into quality queries TrecTopicsReader qReader = new TrecTopicsReader(); - QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile))); + QualityQuery qqs[] = qReader.readQueries(new BufferedReader(IOUtils.getDecodingReader(topicsFile, IOUtils.CHARSET_UTF_8))); // prepare judge, with trec utilities that read from a QRels file - Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile))); + Judge judge = new TrecJudge(new BufferedReader(IOUtils.getDecodingReader(qrelsFile, IOUtils.CHARSET_UTF_8))); // validate topics & judgments match each other judge.validateData(qqs, logger); diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java index 82fa2a01b25..dade9976763 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java @@ -19,12 +19,18 @@ package org.apache.lucene.benchmark.utils; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.lucene.util.IOUtils; + /** * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body @@ -73,7 +79,7 @@ public class ExtractReuters { */ protected void extractFile(File sgmFile) { try { - BufferedReader reader = new BufferedReader(new FileReader(sgmFile)); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), IOUtils.CHARSET_UTF_8)); StringBuilder buffer = new StringBuilder(1024); StringBuilder outBuffer = new StringBuilder(1024); @@ -107,7 +113,7 @@ public class ExtractReuters { File outFile = new File(outputDir, sgmFile.getName() + "-" + (docNumber++) + ".txt"); // System.out.println("Writing " + outFile); - FileWriter writer = new FileWriter(outFile); + OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outFile), IOUtils.CHARSET_UTF_8); writer.write(out); writer.close(); outBuffer.setLength(0); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index 1e7bff43972..b0e9c7804dd 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -219,10 +219,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase { ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); CheckIndex checker = new CheckIndex(dir); - checker.setInfoStream(new PrintStream(bos)); + checker.setInfoStream(new PrintStream(bos, false, "UTF-8")); CheckIndex.Status indexStatus = checker.checkIndex(); assertFalse(indexStatus.clean); - assertTrue(bos.toString().contains(IndexFormatTooOldException.class.getName())); + assertTrue(bos.toString("UTF-8").contains(IndexFormatTooOldException.class.getName())); dir.close(); _TestUtil.rmDir(oldIndxeDir);