LUCENE-4199: fix more charset/locale violations

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4199@1358564 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-07-07 13:30:52 +00:00
parent 5a6a9e7f80
commit 8834e675ca
11 changed files with 36 additions and 18 deletions

View File

@ -23,6 +23,7 @@ import java.io.Reader;
import org.apache.lucene.benchmark.byTask.utils.Algorithm; import org.apache.lucene.benchmark.byTask.utils.Algorithm;
import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.util.IOUtils;
/** /**
@ -106,7 +107,7 @@ public class Benchmark {
Benchmark benchmark = null; Benchmark benchmark = null;
try { try {
benchmark = new Benchmark(new FileReader(algFile)); benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, IOUtils.CHARSET_UTF_8));
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
System.exit(1); System.exit(1);

View File

@ -18,12 +18,14 @@ package org.apache.lucene.benchmark.byTask.feeds;
*/ */
import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.util.IOUtils;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileFilter; import java.io.FileFilter;
import java.io.FileReader; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DateFormat; import java.text.DateFormat;
import java.text.ParsePosition; import java.text.ParsePosition;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
@ -198,7 +200,7 @@ public class DirContentSource extends ContentSource {
name = f.getCanonicalPath()+"_"+iteration; name = f.getCanonicalPath()+"_"+iteration;
} }
BufferedReader reader = new BufferedReader(new FileReader(f)); BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
String line = null; String line = null;
//First line is the date, 3rd is the title, rest is body //First line is the date, 3rd is the title, rest is body
String dateStr = reader.readLine(); String dateStr = reader.readLine();

View File

@ -25,6 +25,7 @@ import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction; import java.nio.charset.CodingErrorAction;
import java.util.HashMap; import java.util.HashMap;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Config;
@ -146,7 +147,7 @@ public class EnwikiContentSource extends ContentSource {
case BODY: case BODY:
body = contents.toString(); body = contents.toString();
//workaround that startsWith doesn't have an ignore case option: lowercase only the first 10 chars, enough to cover "#redirect". //workaround that startsWith doesn't have an ignore case option: lowercase only the first 10 chars, enough to cover "#redirect".
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT);
if (startsWith.startsWith("#redirect")) { if (startsWith.startsWith("#redirect")) {
body = null; body = null;
} }

View File

@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask; import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import java.io.*; import java.io.*;
@ -59,13 +60,14 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake
{ {
File file = new File(fileName); File file = new File(fileName);
Reader reader = null; Reader reader = null;
// note: we use a decoding reader, so a queries file that is not valid UTF-8 is reported as an error instead of being silently garbled
if (file.exists()) { if (file.exists()) {
reader = new FileReader(file); reader = IOUtils.getDecodingReader(file, IOUtils.CHARSET_UTF_8);
} else { } else {
//see if we can find it as a resource //see if we can find it as a resource
InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName); InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
if (asStream != null) { if (asStream != null) {
reader = new InputStreamReader(asStream); reader = IOUtils.getDecodingReader(asStream, IOUtils.CHARSET_UTF_8);
} }
} }
if (reader != null) { if (reader != null) {

View File

@ -19,8 +19,9 @@ package org.apache.lucene.benchmark.byTask.feeds;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DateFormat; import java.text.DateFormat;
import java.text.ParsePosition; import java.text.ParsePosition;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
@ -29,6 +30,7 @@ import java.util.Date;
import java.util.Locale; import java.util.Locale;
import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.util.IOUtils;
/** /**
* A {@link ContentSource} reading from the Reuters collection. * A {@link ContentSource} reading from the Reuters collection.
@ -112,7 +114,7 @@ public class ReutersContentSource extends ContentSource {
name = f.getCanonicalPath() + "_" + iteration; name = f.getCanonicalPath() + "_" + iteration;
} }
BufferedReader reader = new BufferedReader(new FileReader(f)); BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
try { try {
// First line is the date, 3rd is the title, rest is body // First line is the date, 3rd is the title, rest is body
String dateStr = reader.readLine(); String dateStr = reader.readLine();

View File

@ -40,6 +40,7 @@ import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.PrintStream; import java.io.PrintStream;
import java.nio.charset.Charset;
/** /**
* Create an index. <br> * Create an index. <br>
@ -182,7 +183,7 @@ public class CreateIndexTask extends PerfTask {
iwc.setInfoStream(System.err); iwc.setInfoStream(System.err);
} else { } else {
File f = new File(infoStreamVal).getAbsoluteFile(); File f = new File(infoStreamVal).getAbsoluteFile();
iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f)))); iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f)), false, Charset.defaultCharset().name()));
} }
} }
IndexWriter writer = new IndexWriter(runData.getDirectory(), iwc); IndexWriter writer = new IndexWriter(runData.getDirectory(), iwc);

View File

@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.text.NumberFormat; import java.text.NumberFormat;
import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.PerfRunData;
@ -487,7 +488,7 @@ public class TaskSequence extends PerfTask {
if (rate>0) { if (rate>0) {
seqName += "_" + rate + (perMin?"/min":"/sec"); seqName += "_" + rate + (perMin?"/min":"/sec");
} }
if (parallel && seqName.toLowerCase().indexOf("par")<0) { if (parallel && seqName.toLowerCase(Locale.ROOT).indexOf("par")<0) {
seqName += "_Par"; seqName += "_Par";
} }
} }

View File

@ -22,6 +22,7 @@ import java.io.StringReader;
import java.lang.reflect.Constructor; import java.lang.reflect.Constructor;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Locale;
import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.tasks.PerfTask; import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
@ -159,7 +160,7 @@ public class Algorithm {
} else { } else {
stok.nextToken(); stok.nextToken();
if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString()); if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
String unit = stok.sval.toLowerCase(); String unit = stok.sval.toLowerCase(Locale.ROOT);
if ("min".equals(unit)) { if ("min".equals(unit)) {
((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min ((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min
} else if ("sec".equals(unit)) { } else if ("sec".equals(unit)) {

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.IOUtils;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
@ -51,7 +52,7 @@ public class QueryDriver {
File topicsFile = new File(args[0]); File topicsFile = new File(args[0]);
File qrelsFile = new File(args[1]); File qrelsFile = new File(args[1]);
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2]), "lucene"); SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2], "UTF-8"), "lucene");
FSDirectory dir = FSDirectory.open(new File(args[3])); FSDirectory dir = FSDirectory.open(new File(args[3]));
String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified. String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
IndexReader reader = DirectoryReader.open(dir); IndexReader reader = DirectoryReader.open(dir);
@ -64,10 +65,10 @@ public class QueryDriver {
// use trec utilities to read trec topics into quality queries // use trec utilities to read trec topics into quality queries
TrecTopicsReader qReader = new TrecTopicsReader(); TrecTopicsReader qReader = new TrecTopicsReader();
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile))); QualityQuery qqs[] = qReader.readQueries(new BufferedReader(IOUtils.getDecodingReader(topicsFile, IOUtils.CHARSET_UTF_8)));
// prepare judge, with trec utilities that read from a QRels file // prepare judge, with trec utilities that read from a QRels file
Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile))); Judge judge = new TrecJudge(new BufferedReader(IOUtils.getDecodingReader(qrelsFile, IOUtils.CHARSET_UTF_8)));
// validate topics & judgments match each other // validate topics & judgments match each other
judge.validateData(qqs, logger); judge.validateData(qqs, logger);

View File

@ -19,12 +19,18 @@ package org.apache.lucene.benchmark.utils;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileFilter; import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader; import java.io.FileReader;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.lucene.util.IOUtils;
/** /**
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
@ -73,7 +79,7 @@ public class ExtractReuters {
*/ */
protected void extractFile(File sgmFile) { protected void extractFile(File sgmFile) {
try { try {
BufferedReader reader = new BufferedReader(new FileReader(sgmFile)); BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), IOUtils.CHARSET_UTF_8));
StringBuilder buffer = new StringBuilder(1024); StringBuilder buffer = new StringBuilder(1024);
StringBuilder outBuffer = new StringBuilder(1024); StringBuilder outBuffer = new StringBuilder(1024);
@ -107,7 +113,7 @@ public class ExtractReuters {
File outFile = new File(outputDir, sgmFile.getName() + "-" File outFile = new File(outputDir, sgmFile.getName() + "-"
+ (docNumber++) + ".txt"); + (docNumber++) + ".txt");
// System.out.println("Writing " + outFile); // System.out.println("Writing " + outFile);
FileWriter writer = new FileWriter(outFile); OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outFile), IOUtils.CHARSET_UTF_8);
writer.write(out); writer.write(out);
writer.close(); writer.close();
outBuffer.setLength(0); outBuffer.setLength(0);

View File

@ -219,10 +219,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex checker = new CheckIndex(dir); CheckIndex checker = new CheckIndex(dir);
checker.setInfoStream(new PrintStream(bos)); checker.setInfoStream(new PrintStream(bos, false, "UTF-8"));
CheckIndex.Status indexStatus = checker.checkIndex(); CheckIndex.Status indexStatus = checker.checkIndex();
assertFalse(indexStatus.clean); assertFalse(indexStatus.clean);
assertTrue(bos.toString().contains(IndexFormatTooOldException.class.getName())); assertTrue(bos.toString("UTF-8").contains(IndexFormatTooOldException.class.getName()));
dir.close(); dir.close();
_TestUtil.rmDir(oldIndxeDir); _TestUtil.rmDir(oldIndxeDir);