From 67d2e87fee0857b9958ae7dd949268b65e82b1de Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 15 Jul 2010 13:50:48 +0000 Subject: [PATCH] SOLR-2003: report (throw exception) rather than replace charset errors in SolrResourceLoader.getLines git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@964430 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 3 +++ .../apache/solr/core/SolrResourceLoader.java | 5 ++++- .../apache/solr/core/ResourceLoaderTest.java | 11 +++++++++++ .../solr/conf/stopwordsWrongEncoding.txt | 18 ++++++++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 solr/src/test/test-files/solr/conf/stopwordsWrongEncoding.txt diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 2cc636a4dfe..b97e6c8631f 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -467,6 +467,9 @@ Other Changes * SOLR-1946: Misc improvements to the SystemInfoHandler: /admin/system (hossman) +* SOLR-2003: SolrResourceLoader will report any encoding errors, rather than + silently using replacement characters for invalid inputs (blargy via rmuir) + Build ---------------------- diff --git a/solr/src/java/org/apache/solr/core/SolrResourceLoader.java b/solr/src/java/org/apache/solr/core/SolrResourceLoader.java index 669b26a1b6b..e8f037d7935 100644 --- a/solr/src/java/org/apache/solr/core/SolrResourceLoader.java +++ b/solr/src/java/org/apache/solr/core/SolrResourceLoader.java @@ -33,6 +33,7 @@ import java.util.concurrent.ConcurrentHashMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.charset.Charset; +import java.nio.charset.CodingErrorAction; import java.lang.reflect.Constructor; import javax.naming.Context; @@ -316,7 +317,9 @@ public class SolrResourceLoader implements ResourceLoader ArrayList lines; try { input = new BufferedReader(new InputStreamReader(openResource(resource), - charset)); + charset.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT))); lines = new ArrayList(); for (String word=null; (word=input.readLine())!=null;) { diff --git a/solr/src/test/org/apache/solr/core/ResourceLoaderTest.java b/solr/src/test/org/apache/solr/core/ResourceLoaderTest.java index 8c77ca2143d..41364613bcc 100644 --- a/solr/src/test/org/apache/solr/core/ResourceLoaderTest.java +++ b/solr/src/test/org/apache/solr/core/ResourceLoaderTest.java @@ -31,6 +31,7 @@ import org.apache.solr.util.plugin.SolrCoreAware; import java.io.File; import java.io.InputStream; +import java.nio.charset.MalformedInputException; import java.util.Arrays; import java.util.List; @@ -117,4 +118,14 @@ public class ResourceLoaderTest extends TestCase assertEquals(1, lines.size()); assertEquals("BOMsAreEvil", lines.get(0)); } + + public void testWrongEncoding() throws Exception { + String wrongEncoding = "stopwordsWrongEncoding.txt"; + SolrResourceLoader loader = new SolrResourceLoader(null); + // ensure we get our exception + try { + List lines = loader.getLines(wrongEncoding); + fail(); + } catch (MalformedInputException expected) {} + } } diff --git a/solr/src/test/test-files/solr/conf/stopwordsWrongEncoding.txt b/solr/src/test/test-files/solr/conf/stopwordsWrongEncoding.txt new file mode 100644 index 00000000000..0d305c88c59 --- /dev/null +++ b/solr/src/test/test-files/solr/conf/stopwordsWrongEncoding.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# stopwords in the wrong encoding (ISO-8859-1). +# tests resourceloader's ability to report wrongly encoded files. +baņadores