LUCENE-4257: factor getLines out of ResourceLoader and into WordlistLoader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1365992 13f79535-47bb-0310-9956-ffa450edef68
2012-07-26 13:43:28 +00:00 · 2012-07-26 13:43:28 +00:00 · 8bfa167a73
parent 5abc76ea42
commit 8bfa167a73
15 changed files with 67 additions and 186 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java
@ -57,12 +57,12 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
      List<String> wlist = null;
      File mappingFile = new File(mapping);
      if (mappingFile.exists()) {
-        wlist = loader.getLines(mapping);
+        wlist = getLines(loader, mapping);
      } else {
        List<String> files = splitFileNames(mapping);
        wlist = new ArrayList<String>();
        for (String file : files) {
-          List<String> lines = loader.getLines(file.trim());
+          List<String> lines = getLines(loader, file.trim());
          wlist.addAll(lines);
        }
      }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java
@ -51,7 +51,7 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
      if (files.size() > 0) {
        stopTypes = new HashSet<String>();
        for (String file : files) {
-          List<String> typesLines = loader.getLines(file.trim());
+          List<String> typesLines = getLines(loader, file.trim());
          stopTypes.addAll(typesLines);
        }
      }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java
@ -49,7 +49,7 @@ public class StemmerOverrideFilterFactory extends TokenFilterFactory implements
        dictionary = new CharArrayMap<String>(luceneMatchVersion, 
            files.size() * 10, ignoreCase);
        for (String file : files) {
-          List<String> list = loader.getLines(file.trim());
+          List<String> list = getLines(loader, file.trim());
          for (String line : list) {
            String[] mapping = line.split("\t", 2);
            dictionary.put(mapping[0], mapping[1]);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
@ -63,7 +63,7 @@ public class WordDelimiterFilterFactory extends TokenFilterFactory implements Re
      List<String> files = splitFileNames( types );
      List<String> wlist = new ArrayList<String>();
      for( String file : files ){
-        List<String> lines = loader.getLines( file.trim() );
+        List<String> lines = getLines(loader, file.trim());
        wlist.addAll( lines );
      }
      typeTable = parseTypes(wlist);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java
@ -129,13 +129,17 @@ public abstract class AbstractAnalysisFactory {
      words = new CharArraySet(luceneMatchVersion,
          files.size() * 10, ignoreCase);
      for (String file : files) {
-        List<String> wlist = loader.getLines(file.trim());
+        List<String> wlist = getLines(loader, file.trim());
        words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
            ignoreCase));
      }
    }
    return words;
  }
+  
+  protected List<String> getLines(ResourceLoader loader, String resource) throws IOException {
+    return WordlistLoader.getLines(loader.openResource(resource), IOUtils.CHARSET_UTF_8);
+  }

  /** same as {@link #getWordSet(ResourceLoader, String, boolean)},
   * except the input is in snowball format. */
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ResourceAsStreamResourceLoader.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ResourceAsStreamResourceLoader.java
@ -17,17 +17,13 @@ package org.apache.lucene.analysis.util;
 * limitations under the License.
 */

-import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.CharacterCodingException;
-import java.nio.charset.CodingErrorAction;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.util.IOUtils;

+/**
+ * Simple ResourceLoader that uses Class.getResourceAsStream
+ * and Class.forName to open resources and classes, respectively.
+ */
 public class ResourceAsStreamResourceLoader implements ResourceLoader {
  Class<?> clazz;
  
@ -40,37 +36,6 @@ public class ResourceAsStreamResourceLoader implements ResourceLoader {
    return clazz.getResourceAsStream(resource);
  }

-  @Override
-  public List<String> getLines(String resource) throws IOException {
-    BufferedReader input = null;
-    ArrayList<String> lines;
-    try {
-      input = new BufferedReader(new InputStreamReader(openResource(resource),
-          IOUtils.CHARSET_UTF_8.newDecoder()
-          .onMalformedInput(CodingErrorAction.REPORT)
-          .onUnmappableCharacter(CodingErrorAction.REPORT)));
-
-      lines = new ArrayList<String>();
-      for (String word=null; (word=input.readLine())!=null;) {
-        // skip initial bom marker
-        if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
-          word = word.substring(1);
-        // skip comments
-        if (word.startsWith("#")) continue;
-        word=word.trim();
-        // skip blank lines
-        if (word.length()==0) continue;
-        lines.add(word);
-      }
-    } catch (CharacterCodingException ex) {
-      throw new RuntimeException("Error loading resource (wrong encoding?): " + resource, ex);
-    } finally {
-      if (input != null)
-        input.close();
-    }
-    return lines;
-  }
-
  // TODO: do this subpackages thing... wtf is that?
  @Override
  public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ResourceLoader.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ResourceLoader.java
@ -19,29 +19,19 @@ package org.apache.lucene.analysis.util;

 import java.io.IOException;
 import java.io.InputStream;
-import java.util.List;

 /**
 * Abstraction for loading resources (streams, files, and classes).
 */
 public interface ResourceLoader {

+  /**
+   * Opens a named resource
+   */
  public InputStream openResource(String resource) throws IOException;
  
  /**
-   * Accesses a resource by name and returns the (non comment) lines
-   * containing data.
-   *
-   * <p>
-   * A comment line is any line that starts with the character "#"
-   * </p>
-   *
-   * @param resource
-   * @return a list of non-blank non-comment lines with whitespace trimmed
-   * from front and back.
-   * @throws IOException
+   * Creates a class of the name and expected type
   */
-  public List<String> getLines(String resource) throws IOException;
-  
  public <T> T newInstance(String cname, Class<T> expectedType, String ... subpackages);
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java
@ -19,7 +19,11 @@ package org.apache.lucene.analysis.util;

 import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;

 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
@ -194,6 +198,47 @@ public class WordlistLoader {
    return result;
  }
  
+  /**
+   * Accesses a resource by name and returns the (non comment) lines containing
+   * data using the given character encoding.
+   *
+   * <p>
+   * A comment line is any line that starts with the character "#"
+   * </p>
+   *
+   * @return a list of non-blank non-comment lines with whitespace trimmed
+   * @throws IOException
+   */
+  public static List<String> getLines(InputStream stream, Charset charset) throws IOException{
+    BufferedReader input = null;
+    ArrayList<String> lines;
+    boolean success = false;
+    try {
+      input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
+
+      lines = new ArrayList<String>();
+      for (String word=null; (word=input.readLine())!=null;) {
+        // skip initial bom marker
+        if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
+          word = word.substring(1);
+        // skip comments
+        if (word.startsWith("#")) continue;
+        word=word.trim();
+        // skip blank lines
+        if (word.length()==0) continue;
+        lines.add(word);
+      }
+      success = true;
+      return lines;
+    } finally {
+      if (success) {
+        IOUtils.close(input);
+      } else {
+        IOUtils.closeWhileHandlingException(input);
+      }
+    }
+  }
+  
  private static BufferedReader getBufferedReader(Reader reader) {
    return (reader instanceof BufferedReader) ? (BufferedReader) reader
        : new BufferedReader(reader);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.util;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;

 /** Fake resource loader for tests: works if you want to fake reading a single file */
 public class StringMockResourceLoader implements ResourceLoader {
@ -31,10 +29,6 @@ public class StringMockResourceLoader implements ResourceLoader {
    this.text = text;
  }

-  public List<String> getLines(String resource) throws IOException {
-    return Arrays.asList(text.split("\n"));
-  }
-
  // TODO: do this subpackages thing... wtf is that?
  public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
    try {
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java
@ -33,10 +33,6 @@ class StringMockResourceLoader implements ResourceLoader {
    this.text = text;
  }

-  public List<String> getLines(String resource) throws IOException {
-    return Arrays.asList(text.split("\n"));
-  }
-
  // TODO: do this subpackages thing... wtf is that?
  public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
    try {
--- a/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/ResourceAsStreamResourceLoader.java
+++ b/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/ResourceAsStreamResourceLoader.java
@ -1,85 +0,0 @@
-package org.apache.lucene.analysis.stempel;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.CharacterCodingException;
-import java.nio.charset.CodingErrorAction;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.util.IOUtils;
-
-public class ResourceAsStreamResourceLoader implements ResourceLoader {
-  Class<?> clazz;
-  
-  public ResourceAsStreamResourceLoader(Class<?> clazz) {
-    this.clazz = clazz;
-  }
-
-  @Override
-  public InputStream openResource(String resource) throws IOException {
-    return clazz.getResourceAsStream(resource);
-  }
-
-  @Override
-  public List<String> getLines(String resource) throws IOException {
-    BufferedReader input = null;
-    ArrayList<String> lines;
-    try {
-      input = new BufferedReader(new InputStreamReader(openResource(resource),
-          IOUtils.CHARSET_UTF_8.newDecoder()
-          .onMalformedInput(CodingErrorAction.REPORT)
-          .onUnmappableCharacter(CodingErrorAction.REPORT)));
-
-      lines = new ArrayList<String>();
-      for (String word=null; (word=input.readLine())!=null;) {
-        // skip initial bom marker
-        if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
-          word = word.substring(1);
-        // skip comments
-        if (word.startsWith("#")) continue;
-        word=word.trim();
-        // skip blank lines
-        if (word.length()==0) continue;
-        lines.add(word);
-      }
-    } catch (CharacterCodingException ex) {
-      throw new RuntimeException("Error loading resource (wrong encoding?): " + resource, ex);
-    } finally {
-      if (input != null)
-        input.close();
-    }
-    return lines;
-  }
-
-  // TODO: do this subpackages thing... wtf is that?
-  @Override
-  public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
-    try {
-      Class<? extends T> clazz = Class.forName(cname).asSubclass(expectedType);
-      return clazz.newInstance();
-    } catch (Exception e) {
-      throw new RuntimeException(e);
-    }
-  }
-}
--- a/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java
+++ b/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java
@ -22,6 +22,7 @@ import java.io.StringReader;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader;

 /**
 * Tests for {@link StempelPolishStemFilterFactory}
--- a/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java
@ -37,6 +37,7 @@ import org.apache.lucene.analysis.util.ResourceLoaderAware;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.analysis.util.AnalysisSPILoader;
+import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.WeakIdentityMap;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.handler.admin.CoreAdminHandler;
@ -337,34 +338,12 @@ public class SolrResourceLoader implements ResourceLoader


  public List<String> getLines(String resource, Charset charset) throws IOException{
-    BufferedReader input = null;
-    ArrayList<String> lines;
    try {
-      input = new BufferedReader(new InputStreamReader(openResource(resource),
-          charset.newDecoder()
-          .onMalformedInput(CodingErrorAction.REPORT)
-          .onUnmappableCharacter(CodingErrorAction.REPORT)));
-
-      lines = new ArrayList<String>();
-      for (String word=null; (word=input.readLine())!=null;) {
-        // skip initial bom marker
-        if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
-          word = word.substring(1);
-        // skip comments
-        if (word.startsWith("#")) continue;
-        word=word.trim();
-        // skip blank lines
-        if (word.length()==0) continue;
-        lines.add(word);
-      }
+      return WordlistLoader.getLines(openResource(resource), charset);
    } catch (CharacterCodingException ex) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, 
-          "Error loading resource (wrong encoding?): " + resource, ex);
-    } finally {
-      if (input != null)
-        input.close();
+         "Error loading resource (wrong encoding?): " + resource, ex);
    }
-    return lines;
  }

  /*
--- a/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java
@ -55,10 +55,6 @@ public class TestMultiWordSynonyms extends BaseTokenStreamTestCase {
      this.text = text;
    }

-    public List<String> getLines(String resource) throws IOException {
-      return null;
-    }
-
    public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
      return null;
    }
--- a/solr/test-framework/src/java/org/apache/solr/analysis/StringMockSolrResourceLoader.java
+++ b/solr/test-framework/src/java/org/apache/solr/analysis/StringMockSolrResourceLoader.java
@ -32,10 +32,6 @@ class StringMockSolrResourceLoader implements ResourceLoader {
    this.text = text;
  }

-  public List<String> getLines(String resource) throws IOException {
-    return Arrays.asList(text.split("\n"));
-  }
-
  public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
    return null;
  }