mirror of https://github.com/apache/lucene.git
LUCENE-4199: fix more charset/locale violations
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4199@1358564 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5a6a9e7f80
commit
8834e675ca
|
@ -23,6 +23,7 @@ import java.io.Reader;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
|
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -106,7 +107,7 @@ public class Benchmark {
|
||||||
|
|
||||||
Benchmark benchmark = null;
|
Benchmark benchmark = null;
|
||||||
try {
|
try {
|
||||||
benchmark = new Benchmark(new FileReader(algFile));
|
benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, IOUtils.CHARSET_UTF_8));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
System.exit(1);
|
System.exit(1);
|
||||||
|
|
|
@ -18,12 +18,14 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileFilter;
|
import java.io.FileFilter;
|
||||||
import java.io.FileReader;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.text.DateFormat;
|
import java.text.DateFormat;
|
||||||
import java.text.ParsePosition;
|
import java.text.ParsePosition;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
|
@ -198,7 +200,7 @@ public class DirContentSource extends ContentSource {
|
||||||
name = f.getCanonicalPath()+"_"+iteration;
|
name = f.getCanonicalPath()+"_"+iteration;
|
||||||
}
|
}
|
||||||
|
|
||||||
BufferedReader reader = new BufferedReader(new FileReader(f));
|
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
|
||||||
String line = null;
|
String line = null;
|
||||||
//First line is the date, 3rd is the title, rest is body
|
//First line is the date, 3rd is the title, rest is body
|
||||||
String dateStr = reader.readLine();
|
String dateStr = reader.readLine();
|
||||||
|
|
|
@ -25,6 +25,7 @@ import java.io.InputStreamReader;
|
||||||
import java.nio.charset.CharsetDecoder;
|
import java.nio.charset.CharsetDecoder;
|
||||||
import java.nio.charset.CodingErrorAction;
|
import java.nio.charset.CodingErrorAction;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
@ -146,7 +147,7 @@ public class EnwikiContentSource extends ContentSource {
|
||||||
case BODY:
|
case BODY:
|
||||||
body = contents.toString();
|
body = contents.toString();
|
||||||
//workaround that startswith doesn't have an ignore case option, get at least 20 chars.
|
//workaround that startswith doesn't have an ignore case option, get at least 20 chars.
|
||||||
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
|
String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT);
|
||||||
if (startsWith.startsWith("#redirect")) {
|
if (startsWith.startsWith("#redirect")) {
|
||||||
body = null;
|
body = null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.classic.ParseException;
|
||||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask;
|
import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
@ -59,13 +60,14 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake
|
||||||
{
|
{
|
||||||
File file = new File(fileName);
|
File file = new File(fileName);
|
||||||
Reader reader = null;
|
Reader reader = null;
|
||||||
|
// note: we use a decoding reader, so if your queries are screwed up you know
|
||||||
if (file.exists()) {
|
if (file.exists()) {
|
||||||
reader = new FileReader(file);
|
reader = IOUtils.getDecodingReader(file, IOUtils.CHARSET_UTF_8);
|
||||||
} else {
|
} else {
|
||||||
//see if we can find it as a resource
|
//see if we can find it as a resource
|
||||||
InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
|
InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
|
||||||
if (asStream != null) {
|
if (asStream != null) {
|
||||||
reader = new InputStreamReader(asStream);
|
reader = IOUtils.getDecodingReader(asStream, IOUtils.CHARSET_UTF_8);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (reader != null) {
|
if (reader != null) {
|
||||||
|
|
|
@ -19,8 +19,9 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.text.DateFormat;
|
import java.text.DateFormat;
|
||||||
import java.text.ParsePosition;
|
import java.text.ParsePosition;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
|
@ -29,6 +30,7 @@ import java.util.Date;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A {@link ContentSource} reading from the Reuters collection.
|
* A {@link ContentSource} reading from the Reuters collection.
|
||||||
|
@ -112,7 +114,7 @@ public class ReutersContentSource extends ContentSource {
|
||||||
name = f.getCanonicalPath() + "_" + iteration;
|
name = f.getCanonicalPath() + "_" + iteration;
|
||||||
}
|
}
|
||||||
|
|
||||||
BufferedReader reader = new BufferedReader(new FileReader(f));
|
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
|
||||||
try {
|
try {
|
||||||
// First line is the date, 3rd is the title, rest is body
|
// First line is the date, 3rd is the title, rest is body
|
||||||
String dateStr = reader.readLine();
|
String dateStr = reader.readLine();
|
||||||
|
|
|
@ -40,6 +40,7 @@ import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create an index. <br>
|
* Create an index. <br>
|
||||||
|
@ -182,7 +183,7 @@ public class CreateIndexTask extends PerfTask {
|
||||||
iwc.setInfoStream(System.err);
|
iwc.setInfoStream(System.err);
|
||||||
} else {
|
} else {
|
||||||
File f = new File(infoStreamVal).getAbsoluteFile();
|
File f = new File(infoStreamVal).getAbsoluteFile();
|
||||||
iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f))));
|
iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f)), false, Charset.defaultCharset().name()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
IndexWriter writer = new IndexWriter(runData.getDirectory(), iwc);
|
IndexWriter writer = new IndexWriter(runData.getDirectory(), iwc);
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.text.NumberFormat;
|
import java.text.NumberFormat;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
|
@ -487,7 +488,7 @@ public class TaskSequence extends PerfTask {
|
||||||
if (rate>0) {
|
if (rate>0) {
|
||||||
seqName += "_" + rate + (perMin?"/min":"/sec");
|
seqName += "_" + rate + (perMin?"/min":"/sec");
|
||||||
}
|
}
|
||||||
if (parallel && seqName.toLowerCase().indexOf("par")<0) {
|
if (parallel && seqName.toLowerCase(Locale.ROOT).indexOf("par")<0) {
|
||||||
seqName += "_Par";
|
seqName += "_Par";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.io.StringReader;
|
||||||
import java.lang.reflect.Constructor;
|
import java.lang.reflect.Constructor;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
|
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
|
||||||
|
@ -159,7 +160,7 @@ public class Algorithm {
|
||||||
} else {
|
} else {
|
||||||
stok.nextToken();
|
stok.nextToken();
|
||||||
if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
|
if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString());
|
||||||
String unit = stok.sval.toLowerCase();
|
String unit = stok.sval.toLowerCase(Locale.ROOT);
|
||||||
if ("min".equals(unit)) {
|
if ("min".equals(unit)) {
|
||||||
((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min
|
((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min
|
||||||
} else if ("sec".equals(unit)) {
|
} else if ("sec".equals(unit)) {
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
@ -51,7 +52,7 @@ public class QueryDriver {
|
||||||
|
|
||||||
File topicsFile = new File(args[0]);
|
File topicsFile = new File(args[0]);
|
||||||
File qrelsFile = new File(args[1]);
|
File qrelsFile = new File(args[1]);
|
||||||
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2]), "lucene");
|
SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2], "UTF-8"), "lucene");
|
||||||
FSDirectory dir = FSDirectory.open(new File(args[3]));
|
FSDirectory dir = FSDirectory.open(new File(args[3]));
|
||||||
String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
|
String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
|
||||||
IndexReader reader = DirectoryReader.open(dir);
|
IndexReader reader = DirectoryReader.open(dir);
|
||||||
|
@ -64,10 +65,10 @@ public class QueryDriver {
|
||||||
|
|
||||||
// use trec utilities to read trec topics into quality queries
|
// use trec utilities to read trec topics into quality queries
|
||||||
TrecTopicsReader qReader = new TrecTopicsReader();
|
TrecTopicsReader qReader = new TrecTopicsReader();
|
||||||
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile)));
|
QualityQuery qqs[] = qReader.readQueries(new BufferedReader(IOUtils.getDecodingReader(topicsFile, IOUtils.CHARSET_UTF_8)));
|
||||||
|
|
||||||
// prepare judge, with trec utilities that read from a QRels file
|
// prepare judge, with trec utilities that read from a QRels file
|
||||||
Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile)));
|
Judge judge = new TrecJudge(new BufferedReader(IOUtils.getDecodingReader(qrelsFile, IOUtils.CHARSET_UTF_8)));
|
||||||
|
|
||||||
// validate topics & judgments match each other
|
// validate topics & judgments match each other
|
||||||
judge.validateData(qqs, logger);
|
judge.validateData(qqs, logger);
|
||||||
|
|
|
@ -19,12 +19,18 @@ package org.apache.lucene.benchmark.utils;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileFilter;
|
import java.io.FileFilter;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
|
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
|
||||||
|
@ -73,7 +79,7 @@ public class ExtractReuters {
|
||||||
*/
|
*/
|
||||||
protected void extractFile(File sgmFile) {
|
protected void extractFile(File sgmFile) {
|
||||||
try {
|
try {
|
||||||
BufferedReader reader = new BufferedReader(new FileReader(sgmFile));
|
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), IOUtils.CHARSET_UTF_8));
|
||||||
|
|
||||||
StringBuilder buffer = new StringBuilder(1024);
|
StringBuilder buffer = new StringBuilder(1024);
|
||||||
StringBuilder outBuffer = new StringBuilder(1024);
|
StringBuilder outBuffer = new StringBuilder(1024);
|
||||||
|
@ -107,7 +113,7 @@ public class ExtractReuters {
|
||||||
File outFile = new File(outputDir, sgmFile.getName() + "-"
|
File outFile = new File(outputDir, sgmFile.getName() + "-"
|
||||||
+ (docNumber++) + ".txt");
|
+ (docNumber++) + ".txt");
|
||||||
// System.out.println("Writing " + outFile);
|
// System.out.println("Writing " + outFile);
|
||||||
FileWriter writer = new FileWriter(outFile);
|
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outFile), IOUtils.CHARSET_UTF_8);
|
||||||
writer.write(out);
|
writer.write(out);
|
||||||
writer.close();
|
writer.close();
|
||||||
outBuffer.setLength(0);
|
outBuffer.setLength(0);
|
||||||
|
|
|
@ -219,10 +219,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
||||||
|
|
||||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||||
CheckIndex checker = new CheckIndex(dir);
|
CheckIndex checker = new CheckIndex(dir);
|
||||||
checker.setInfoStream(new PrintStream(bos));
|
checker.setInfoStream(new PrintStream(bos, false, "UTF-8"));
|
||||||
CheckIndex.Status indexStatus = checker.checkIndex();
|
CheckIndex.Status indexStatus = checker.checkIndex();
|
||||||
assertFalse(indexStatus.clean);
|
assertFalse(indexStatus.clean);
|
||||||
assertTrue(bos.toString().contains(IndexFormatTooOldException.class.getName()));
|
assertTrue(bos.toString("UTF-8").contains(IndexFormatTooOldException.class.getName()));
|
||||||
|
|
||||||
dir.close();
|
dir.close();
|
||||||
_TestUtil.rmDir(oldIndxeDir);
|
_TestUtil.rmDir(oldIndxeDir);
|
||||||
|
|
Loading…
Reference in New Issue