LUCENE-4257: factor getLines out of ResourceLoader and into WordlistLoader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1365992 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-07-26 13:43:28 +00:00
parent 5abc76ea42
commit 8bfa167a73
15 changed files with 67 additions and 186 deletions

View File

@ -57,12 +57,12 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
List<String> wlist = null;
File mappingFile = new File(mapping);
if (mappingFile.exists()) {
wlist = loader.getLines(mapping);
wlist = getLines(loader, mapping);
} else {
List<String> files = splitFileNames(mapping);
wlist = new ArrayList<String>();
for (String file : files) {
List<String> lines = loader.getLines(file.trim());
List<String> lines = getLines(loader, file.trim());
wlist.addAll(lines);
}
}

View File

@ -51,7 +51,7 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
if (files.size() > 0) {
stopTypes = new HashSet<String>();
for (String file : files) {
List<String> typesLines = loader.getLines(file.trim());
List<String> typesLines = getLines(loader, file.trim());
stopTypes.addAll(typesLines);
}
}

View File

@ -49,7 +49,7 @@ public class StemmerOverrideFilterFactory extends TokenFilterFactory implements
dictionary = new CharArrayMap<String>(luceneMatchVersion,
files.size() * 10, ignoreCase);
for (String file : files) {
List<String> list = loader.getLines(file.trim());
List<String> list = getLines(loader, file.trim());
for (String line : list) {
String[] mapping = line.split("\t", 2);
dictionary.put(mapping[0], mapping[1]);

View File

@ -63,7 +63,7 @@ public class WordDelimiterFilterFactory extends TokenFilterFactory implements Re
List<String> files = splitFileNames( types );
List<String> wlist = new ArrayList<String>();
for( String file : files ){
List<String> lines = loader.getLines( file.trim() );
List<String> lines = getLines(loader, file.trim());
wlist.addAll( lines );
}
typeTable = parseTypes(wlist);

View File

@ -129,13 +129,17 @@ public abstract class AbstractAnalysisFactory {
words = new CharArraySet(luceneMatchVersion,
files.size() * 10, ignoreCase);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
List<String> wlist = getLines(loader, file.trim());
words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
ignoreCase));
}
}
return words;
}
/**
 * Returns the data lines of a named resource.
 * <p>
 * The resource is opened through {@code loader.openResource(resource)} and the
 * resulting stream is passed, together with the UTF-8 charset constant from
 * {@code IOUtils}, to {@code WordlistLoader.getLines}, which produces the list.
 *
 * @param loader   loader used to open the resource
 * @param resource name of the resource to read
 * @return the lines produced by {@code WordlistLoader.getLines}
 * @throws IOException if opening or reading the resource fails
 */
protected List<String> getLines(ResourceLoader loader, String resource) throws IOException {
  return WordlistLoader.getLines(
      loader.openResource(resource),
      IOUtils.CHARSET_UTF_8);
}
/** same as {@link #getWordSet(ResourceLoader, String, boolean)},
* except the input is in snowball format. */

View File

@ -17,17 +17,13 @@ package org.apache.lucene.analysis.util;
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.IOUtils;
/**
* Simple ResourceLoader that uses Class.getResourceAsStream
* and Class.forName to open resources and classes, respectively.
*/
public class ResourceAsStreamResourceLoader implements ResourceLoader {
Class<?> clazz;
@ -40,37 +36,6 @@ public class ResourceAsStreamResourceLoader implements ResourceLoader {
return clazz.getResourceAsStream(resource);
}
/**
 * Reads the named resource as UTF-8 text and returns its data lines.
 * <p>
 * Lines that start with {@code #} are dropped, the remaining lines are trimmed,
 * and lines that are empty after trimming are dropped as well.
 *
 * @param resource name passed to {@code openResource}
 * @return the trimmed, non-empty, non-comment lines in file order
 * @throws IOException if reading or closing the underlying reader fails
 * @throws RuntimeException if the content cannot be decoded as UTF-8
 */
@Override
public List<String> getLines(String resource) throws IOException {
  final List<String> result = new ArrayList<String>();
  BufferedReader reader = null;
  try {
    // The decoder is configured to report bad input instead of replacing it,
    // so a wrongly encoded file surfaces as a CharacterCodingException.
    reader = new BufferedReader(
        new InputStreamReader(
            openResource(resource),
            IOUtils.CHARSET_UTF_8.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT)));
    String line;
    while ((line = reader.readLine()) != null) {
      // A leading BOM is stripped only while no data line has been collected.
      if (result.isEmpty() && line.length() > 0 && line.charAt(0) == '\uFEFF') {
        line = line.substring(1);
      }
      // The comment test runs before trimming: only a '#' in the first column counts.
      if (!line.startsWith("#")) {
        line = line.trim();
        if (line.length() != 0) {
          result.add(line);
        }
      }
    }
  } catch (CharacterCodingException ex) {
    throw new RuntimeException("Error loading resource (wrong encoding?): " + resource, ex);
  } finally {
    if (reader != null) {
      reader.close();
    }
  }
  return result;
}
// TODO: do this subpackages thing... wtf is that?
@Override
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {

View File

@ -19,29 +19,19 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
/**
* Abstraction for loading resources (streams, files, and classes).
*/
public interface ResourceLoader {
/**
* Opens a named resource
*/
public InputStream openResource(String resource) throws IOException;
/**
* Accesses a resource by name and returns the (non comment) lines
* containing data.
*
* <p>
* A comment line is any line that starts with the character "#"
* </p>
*
* @param resource
* @return a list of non-blank non-comment lines with whitespace trimmed
* from front and back.
* @throws IOException
* Creates a class of the name and expected type
*/
public List<String> getLines(String resource) throws IOException;
public <T> T newInstance(String cname, Class<T> expectedType, String ... subpackages);
}

View File

@ -19,7 +19,11 @@ package org.apache.lucene.analysis.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
@ -194,6 +198,47 @@ public class WordlistLoader {
return result;
}
/**
 * Reads the given stream with the given character encoding and returns the
 * lines that contain data.
 *
 * <p>
 * A comment line is any line that starts with the character "#"; such lines
 * are skipped. The remaining lines are trimmed, and lines that are empty
 * after trimming are skipped as well.
 * </p>
 *
 * <p>
 * The reader wrapped around {@code stream} is closed before this method
 * returns, whether it completes normally or with an exception.
 * </p>
 *
 * @param stream  source of the text to read
 * @param charset encoding used to decode {@code stream}
 * @return a list of non-blank non-comment lines with whitespace trimmed
 * @throws IOException if reading from or closing the stream fails
 */
public static List<String> getLines(InputStream stream, Charset charset) throws IOException {
  BufferedReader reader = null;
  boolean completed = false;
  try {
    reader = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
    final List<String> result = new ArrayList<String>();
    String line;
    while ((line = reader.readLine()) != null) {
      // A leading BOM is stripped only while no data line has been collected.
      if (result.isEmpty() && line.length() > 0 && line.charAt(0) == '\uFEFF') {
        line = line.substring(1);
      }
      // The comment test runs before trimming: only a '#' in the first column counts.
      if (!line.startsWith("#")) {
        line = line.trim();
        if (line.length() != 0) {
          result.add(line);
        }
      }
    }
    completed = true;
    return result;
  } finally {
    // Two close paths: plain close after a clean read, the exception-handling
    // variant otherwise (presumably so a close failure does not mask the
    // original error; confirm against IOUtils).
    if (!completed) {
      IOUtils.closeWhileHandlingException(reader);
    } else {
      IOUtils.close(reader);
    }
  }
}
private static BufferedReader getBufferedReader(Reader reader) {
return (reader instanceof BufferedReader) ? (BufferedReader) reader
: new BufferedReader(reader);

View File

@ -20,8 +20,6 @@ package org.apache.lucene.analysis.util;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
/** Fake resource loader for tests: works if you want to fake reading a single file */
public class StringMockResourceLoader implements ResourceLoader {
@ -31,10 +29,6 @@ public class StringMockResourceLoader implements ResourceLoader {
this.text = text;
}
// Returns the mock text split on newline characters. The resource name is
// ignored, and no comment or blank-line filtering is applied.
public List<String> getLines(String resource) throws IOException {
  final String[] rows = text.split("\n");
  return Arrays.asList(rows);
}
// TODO: do this subpackages thing... wtf is that?
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
try {

View File

@ -33,10 +33,6 @@ class StringMockResourceLoader implements ResourceLoader {
this.text = text;
}
// Returns the mock text split on newline characters. The resource name is
// ignored, and no comment or blank-line filtering is applied.
public List<String> getLines(String resource) throws IOException {
  final String[] rows = text.split("\n");
  return Arrays.asList(rows);
}
// TODO: do this subpackages thing... wtf is that?
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
try {

View File

@ -1,85 +0,0 @@
package org.apache.lucene.analysis.stempel;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.util.IOUtils;
/**
 * {@link ResourceLoader} that resolves resources with
 * {@link Class#getResourceAsStream(String)} on a given class and creates
 * instances with {@link Class#forName(String)}.
 */
public class ResourceAsStreamResourceLoader implements ResourceLoader {
  /** Class whose {@code getResourceAsStream} is used to locate resources. */
  Class<?> clazz;

  /**
   * @param clazz class used as the anchor for resource lookups
   */
  public ResourceAsStreamResourceLoader(Class<?> clazz) {
    this.clazz = clazz;
  }

  /**
   * Opens the named resource relative to the anchor class.
   *
   * @return whatever {@code Class.getResourceAsStream} returns for the name
   */
  @Override
  public InputStream openResource(String resource) throws IOException {
    return clazz.getResourceAsStream(resource);
  }

  /**
   * Reads the named resource as UTF-8 text and returns its data lines.
   * <p>
   * Lines that start with {@code #} are dropped, the remaining lines are
   * trimmed, and lines that are empty after trimming are dropped as well.
   *
   * @param resource name passed to {@link #openResource(String)}
   * @return the trimmed, non-empty, non-comment lines in file order
   * @throws IOException if reading or closing the underlying reader fails
   * @throws RuntimeException if the content cannot be decoded as UTF-8
   */
  @Override
  public List<String> getLines(String resource) throws IOException {
    final List<String> result = new ArrayList<String>();
    BufferedReader reader = null;
    try {
      // The decoder is configured to report bad input instead of replacing it,
      // so a wrongly encoded file surfaces as a CharacterCodingException.
      reader = new BufferedReader(
          new InputStreamReader(
              openResource(resource),
              IOUtils.CHARSET_UTF_8.newDecoder()
                  .onMalformedInput(CodingErrorAction.REPORT)
                  .onUnmappableCharacter(CodingErrorAction.REPORT)));
      String line;
      while ((line = reader.readLine()) != null) {
        // A leading BOM is stripped only while no data line has been collected.
        if (result.isEmpty() && line.length() > 0 && line.charAt(0) == '\uFEFF') {
          line = line.substring(1);
        }
        // The comment test runs before trimming: only a '#' in the first column counts.
        if (!line.startsWith("#")) {
          line = line.trim();
          if (line.length() != 0) {
            result.add(line);
          }
        }
      }
    } catch (CharacterCodingException ex) {
      throw new RuntimeException("Error loading resource (wrong encoding?): " + resource, ex);
    } finally {
      if (reader != null) {
        reader.close();
      }
    }
    return result;
  }

  /**
   * Instantiates the class named {@code cname} through its no-argument
   * constructor, after checking it is a subclass of {@code expectedType}.
   * Any failure is rethrown wrapped in a {@link RuntimeException}.
   */
  // TODO: the subpackages argument is currently ignored.
  @Override
  public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
    try {
      return Class.forName(cname).asSubclass(expectedType).newInstance();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
}

View File

@ -22,6 +22,7 @@ import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader;
/**
* Tests for {@link StempelPolishStemFilterFactory}

View File

@ -37,6 +37,7 @@ import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.analysis.util.AnalysisSPILoader;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.WeakIdentityMap;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.handler.admin.CoreAdminHandler;
@ -337,34 +338,12 @@ public class SolrResourceLoader implements ResourceLoader
public List<String> getLines(String resource, Charset charset) throws IOException{
BufferedReader input = null;
ArrayList<String> lines;
try {
input = new BufferedReader(new InputStreamReader(openResource(resource),
charset.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT)));
lines = new ArrayList<String>();
for (String word=null; (word=input.readLine())!=null;) {
// skip initial bom marker
if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
word = word.substring(1);
// skip comments
if (word.startsWith("#")) continue;
word=word.trim();
// skip blank lines
if (word.length()==0) continue;
lines.add(word);
}
return WordlistLoader.getLines(openResource(resource), charset);
} catch (CharacterCodingException ex) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Error loading resource (wrong encoding?): " + resource, ex);
} finally {
if (input != null)
input.close();
"Error loading resource (wrong encoding?): " + resource, ex);
}
return lines;
}
/*

View File

@ -55,10 +55,6 @@ public class TestMultiWordSynonyms extends BaseTokenStreamTestCase {
this.text = text;
}
// Stub: always returns null, regardless of the resource name.
public List<String> getLines(String resource) throws IOException {
return null;
}
// Stub: no class is ever instantiated; always returns null and ignores all arguments.
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
return null;
}

View File

@ -32,10 +32,6 @@ class StringMockSolrResourceLoader implements ResourceLoader {
this.text = text;
}
// Returns the mock text split on newline characters. The resource name is
// ignored, and no comment or blank-line filtering is applied.
public List<String> getLines(String resource) throws IOException {
  final String[] rows = text.split("\n");
  return Arrays.asList(rows);
}
// Stub: no class is ever instantiated; always returns null and ignores all arguments.
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
return null;
}