From f3c25f02d587fd8585a3f0d468195829a13592b6 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sat, 8 May 2010 00:42:04 +0000 Subject: [PATCH] SOLR-1865: ignore BOMs in SolrResourceLoader.getLines git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@942288 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 4 +++ .../apache/solr/core/SolrResourceLoader.java | 3 ++ .../apache/solr/core/ResourceLoaderTest.java | 31 ++++++++++++++++++- .../test/test-files/solr/conf/stopwithbom.txt | 1 + 4 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 solr/src/test/test-files/solr/conf/stopwithbom.txt diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 85b95218168..f93f162e148 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -347,6 +347,10 @@ Other Changes (Chris Male via rmuir) * SOLR-1851: luceneAutoCommit no longer has any effect - it has been remove (Mark Miller) + +* SOLR-1865: SolrResourceLoader.getLines ignores Byte Order Markers (BOMs) at the + beginning of input files, these are often created by editors such as Windows + Notepad. (rmuir, hossman) Build diff --git a/solr/src/java/org/apache/solr/core/SolrResourceLoader.java b/solr/src/java/org/apache/solr/core/SolrResourceLoader.java index 114def4977e..40b2b557728 100644 --- a/solr/src/java/org/apache/solr/core/SolrResourceLoader.java +++ b/solr/src/java/org/apache/solr/core/SolrResourceLoader.java @@ -309,6 +309,9 @@ public class SolrResourceLoader implements ResourceLoader lines = new ArrayList(); for (String word=null; (word=input.readLine())!=null;) { + // skip initial bom marker + if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF') + word = word.substring(1); // skip comments if (word.startsWith("#")) continue; word=word.trim(); diff --git a/solr/src/test/org/apache/solr/core/ResourceLoaderTest.java b/solr/src/test/org/apache/solr/core/ResourceLoaderTest.java index 5b386f0a585..8c77ca2143d 100644 --- a/solr/src/test/org/apache/solr/core/ResourceLoaderTest.java +++ b/solr/src/test/org/apache/solr/core/ResourceLoaderTest.java @@ -30,6 +30,9 @@ import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.solr.util.plugin.SolrCoreAware; import java.io.File; +import java.io.InputStream; +import java.util.Arrays; +import java.util.List; public class ResourceLoaderTest extends TestCase { @@ -88,4 +91,30 @@ public class ResourceLoaderTest extends TestCase catch( SolrException ex ) { } // OK } } -} \ No newline at end of file + + public void testBOMMarkers() throws Exception { + final String fileWithBom = "stopwithbom.txt"; + SolrResourceLoader loader = new SolrResourceLoader(null); + + // preliminary sanity check + InputStream bomStream = loader.openResource(fileWithBom); + try { + final byte[] bomExpected = new byte[] { -17, -69, -65 }; + final byte[] firstBytes = new byte[3]; + + assertEquals("Should have been able to read 3 bytes from bomStream", + 3, bomStream.read(firstBytes)); + + assertTrue("This test only works if " + fileWithBom + + " contains a BOM -- it appears someone removed it.", + Arrays.equals(bomExpected, firstBytes)); + } finally { + try { bomStream.close(); } catch (Exception e) { /* IGNORE */ } + } + + // now make sure getLines skips the BOM... + List lines = loader.getLines(fileWithBom); + assertEquals(1, lines.size()); + assertEquals("BOMsAreEvil", lines.get(0)); + } +} diff --git a/solr/src/test/test-files/solr/conf/stopwithbom.txt b/solr/src/test/test-files/solr/conf/stopwithbom.txt new file mode 100644 index 00000000000..eb5f6e1c0f8 --- /dev/null +++ b/solr/src/test/test-files/solr/conf/stopwithbom.txt @@ -0,0 +1 @@ +BOMsAreEvil