mirror of https://github.com/apache/lucene.git
LUCENE-4257: factor the getLines in ResourceLoader into WordlistLoader
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1365992 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5abc76ea42
commit
8bfa167a73
|
@ -57,12 +57,12 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
|
|||
List<String> wlist = null;
|
||||
File mappingFile = new File(mapping);
|
||||
if (mappingFile.exists()) {
|
||||
wlist = loader.getLines(mapping);
|
||||
wlist = getLines(loader, mapping);
|
||||
} else {
|
||||
List<String> files = splitFileNames(mapping);
|
||||
wlist = new ArrayList<String>();
|
||||
for (String file : files) {
|
||||
List<String> lines = loader.getLines(file.trim());
|
||||
List<String> lines = getLines(loader, file.trim());
|
||||
wlist.addAll(lines);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -51,7 +51,7 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
|
|||
if (files.size() > 0) {
|
||||
stopTypes = new HashSet<String>();
|
||||
for (String file : files) {
|
||||
List<String> typesLines = loader.getLines(file.trim());
|
||||
List<String> typesLines = getLines(loader, file.trim());
|
||||
stopTypes.addAll(typesLines);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,7 +49,7 @@ public class StemmerOverrideFilterFactory extends TokenFilterFactory implements
|
|||
dictionary = new CharArrayMap<String>(luceneMatchVersion,
|
||||
files.size() * 10, ignoreCase);
|
||||
for (String file : files) {
|
||||
List<String> list = loader.getLines(file.trim());
|
||||
List<String> list = getLines(loader, file.trim());
|
||||
for (String line : list) {
|
||||
String[] mapping = line.split("\t", 2);
|
||||
dictionary.put(mapping[0], mapping[1]);
|
||||
|
|
|
@ -63,7 +63,7 @@ public class WordDelimiterFilterFactory extends TokenFilterFactory implements Re
|
|||
List<String> files = splitFileNames( types );
|
||||
List<String> wlist = new ArrayList<String>();
|
||||
for( String file : files ){
|
||||
List<String> lines = loader.getLines( file.trim() );
|
||||
List<String> lines = getLines(loader, file.trim());
|
||||
wlist.addAll( lines );
|
||||
}
|
||||
typeTable = parseTypes(wlist);
|
||||
|
|
|
@ -129,13 +129,17 @@ public abstract class AbstractAnalysisFactory {
|
|||
words = new CharArraySet(luceneMatchVersion,
|
||||
files.size() * 10, ignoreCase);
|
||||
for (String file : files) {
|
||||
List<String> wlist = loader.getLines(file.trim());
|
||||
List<String> wlist = getLines(loader, file.trim());
|
||||
words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
|
||||
ignoreCase));
|
||||
}
|
||||
}
|
||||
return words;
|
||||
}
|
||||
|
||||
/**
 * Opens the named resource through the given loader and returns its lines as
 * parsed by {@link WordlistLoader#getLines(InputStream, Charset)}, decoding
 * the content with {@link IOUtils#CHARSET_UTF_8}.
 *
 * @param loader the loader used to open the resource
 * @param resource the name of the resource to read
 * @return the lines returned by WordlistLoader.getLines for that resource
 * @throws IOException if opening or reading the resource fails
 */
protected List<String> getLines(ResourceLoader loader, String resource) throws IOException {
|
||||
return WordlistLoader.getLines(loader.openResource(resource), IOUtils.CHARSET_UTF_8);
|
||||
}
|
||||
|
||||
/** same as {@link #getWordSet(ResourceLoader, String, boolean)},
|
||||
* except the input is in snowball format. */
|
||||
|
|
|
@ -17,17 +17,13 @@ package org.apache.lucene.analysis.util;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
* Simple ResourceLoader that uses Class.getResourceAsStream
|
||||
* and Class.forName to open resources and classes, respectively.
|
||||
*/
|
||||
public class ResourceAsStreamResourceLoader implements ResourceLoader {
|
||||
Class<?> clazz;
|
||||
|
||||
|
@ -40,37 +36,6 @@ public class ResourceAsStreamResourceLoader implements ResourceLoader {
|
|||
return clazz.getResourceAsStream(resource);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
BufferedReader input = null;
|
||||
ArrayList<String> lines;
|
||||
try {
|
||||
input = new BufferedReader(new InputStreamReader(openResource(resource),
|
||||
IOUtils.CHARSET_UTF_8.newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT)));
|
||||
|
||||
lines = new ArrayList<String>();
|
||||
for (String word=null; (word=input.readLine())!=null;) {
|
||||
// skip initial bom marker
|
||||
if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
|
||||
word = word.substring(1);
|
||||
// skip comments
|
||||
if (word.startsWith("#")) continue;
|
||||
word=word.trim();
|
||||
// skip blank lines
|
||||
if (word.length()==0) continue;
|
||||
lines.add(word);
|
||||
}
|
||||
} catch (CharacterCodingException ex) {
|
||||
throw new RuntimeException("Error loading resource (wrong encoding?): " + resource, ex);
|
||||
} finally {
|
||||
if (input != null)
|
||||
input.close();
|
||||
}
|
||||
return lines;
|
||||
}
|
||||
|
||||
// TODO: do this subpackages thing... wtf is that?
|
||||
@Override
|
||||
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
|
|
@ -19,29 +19,19 @@ package org.apache.lucene.analysis.util;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Abstraction for loading resources (streams, files, and classes).
|
||||
*/
|
||||
public interface ResourceLoader {
|
||||
|
||||
/**
|
||||
* Opens a named resource
|
||||
*/
|
||||
public InputStream openResource(String resource) throws IOException;
|
||||
|
||||
/**
|
||||
* Accesses a resource by name and returns the (non comment) lines
|
||||
* containing data.
|
||||
*
|
||||
* <p>
|
||||
* A comment line is any line that starts with the character "#"
|
||||
* </p>
|
||||
*
|
||||
* @param resource
|
||||
* @return a list of non-blank non-comment lines with whitespace trimmed
|
||||
* from front and back.
|
||||
* @throws IOException
|
||||
* Creates a class of the name and expected type
|
||||
*/
|
||||
public List<String> getLines(String resource) throws IOException;
|
||||
|
||||
public <T> T newInstance(String cname, Class<T> expectedType, String ... subpackages);
|
||||
}
|
|
@ -19,7 +19,11 @@ package org.apache.lucene.analysis.util;
|
|||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -194,6 +198,47 @@ public class WordlistLoader {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the given stream and returns the (non comment) lines containing
|
||||
* data, decoded using the given character encoding.
|
||||
*
|
||||
* <p>
|
||||
* A comment line is any line that starts with the character "#". The check
|
||||
* is made before the line is trimmed, so a "#" preceded by whitespace does
|
||||
* not mark a comment. A leading BOM (U+FEFF) is removed from the start of a
|
||||
* line as long as no line has been collected yet.
|
||||
* </p>
|
||||
*
|
||||
* @param stream the stream to read from; the reader wrapped around it is
|
||||
* closed in the finally block before this method returns
|
||||
* @param charset the character encoding used to decode the stream
|
||||
* @return a list of non-blank non-comment lines with whitespace trimmed
|
||||
* @throws IOException if reading from the stream fails
|
||||
*/
|
||||
public static List<String> getLines(InputStream stream, Charset charset) throws IOException{
|
||||
BufferedReader input = null;
|
||||
ArrayList<String> lines;
|
||||
boolean success = false;
|
||||
try {
|
||||
input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
|
||||
|
||||
lines = new ArrayList<String>();
|
||||
for (String word=null; (word=input.readLine())!=null;) {
|
||||
// skip initial bom marker
|
||||
if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
|
||||
word = word.substring(1);
|
||||
// skip comments (tested before trim(), so an indented "#" line is kept as data)
|
||||
if (word.startsWith("#")) continue;
|
||||
word=word.trim();
|
||||
// skip blank lines
|
||||
if (word.length()==0) continue;
|
||||
lines.add(word);
|
||||
}
|
||||
success = true;
|
||||
return lines;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(input);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(input);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static BufferedReader getBufferedReader(Reader reader) {
|
||||
return (reader instanceof BufferedReader) ? (BufferedReader) reader
|
||||
: new BufferedReader(reader);
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.util;
|
|||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/** Fake resource loader for tests: works if you want to fake reading a single file */
|
||||
public class StringMockResourceLoader implements ResourceLoader {
|
||||
|
@ -31,10 +29,6 @@ public class StringMockResourceLoader implements ResourceLoader {
|
|||
this.text = text;
|
||||
}
|
||||
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
return Arrays.asList(text.split("\n"));
|
||||
}
|
||||
|
||||
// TODO: do this subpackages thing... wtf is that?
|
||||
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
|
||||
try {
|
||||
|
|
|
@ -33,10 +33,6 @@ class StringMockResourceLoader implements ResourceLoader {
|
|||
this.text = text;
|
||||
}
|
||||
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
return Arrays.asList(text.split("\n"));
|
||||
}
|
||||
|
||||
// TODO: do this subpackages thing... wtf is that?
|
||||
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
|
||||
try {
|
||||
|
|
|
@ -1,85 +0,0 @@
|
|||
package org.apache.lucene.analysis.stempel;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
public class ResourceAsStreamResourceLoader implements ResourceLoader {
|
||||
Class<?> clazz;
|
||||
|
||||
public ResourceAsStreamResourceLoader(Class<?> clazz) {
|
||||
this.clazz = clazz;
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream openResource(String resource) throws IOException {
|
||||
return clazz.getResourceAsStream(resource);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
BufferedReader input = null;
|
||||
ArrayList<String> lines;
|
||||
try {
|
||||
input = new BufferedReader(new InputStreamReader(openResource(resource),
|
||||
IOUtils.CHARSET_UTF_8.newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT)));
|
||||
|
||||
lines = new ArrayList<String>();
|
||||
for (String word=null; (word=input.readLine())!=null;) {
|
||||
// skip initial bom marker
|
||||
if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
|
||||
word = word.substring(1);
|
||||
// skip comments
|
||||
if (word.startsWith("#")) continue;
|
||||
word=word.trim();
|
||||
// skip blank lines
|
||||
if (word.length()==0) continue;
|
||||
lines.add(word);
|
||||
}
|
||||
} catch (CharacterCodingException ex) {
|
||||
throw new RuntimeException("Error loading resource (wrong encoding?): " + resource, ex);
|
||||
} finally {
|
||||
if (input != null)
|
||||
input.close();
|
||||
}
|
||||
return lines;
|
||||
}
|
||||
|
||||
// TODO: do this subpackages thing... wtf is that?
|
||||
@Override
|
||||
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
|
||||
try {
|
||||
Class<? extends T> clazz = Class.forName(cname).asSubclass(expectedType);
|
||||
return clazz.newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -22,6 +22,7 @@ import java.io.StringReader;
|
|||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader;
|
||||
|
||||
/**
|
||||
* Tests for {@link StempelPolishStemFilterFactory}
|
||||
|
|
|
@ -37,6 +37,7 @@ import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
|||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.analysis.util.AnalysisSPILoader;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.WeakIdentityMap;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.handler.admin.CoreAdminHandler;
|
||||
|
@ -337,34 +338,12 @@ public class SolrResourceLoader implements ResourceLoader
|
|||
|
||||
|
||||
public List<String> getLines(String resource, Charset charset) throws IOException{
|
||||
BufferedReader input = null;
|
||||
ArrayList<String> lines;
|
||||
try {
|
||||
input = new BufferedReader(new InputStreamReader(openResource(resource),
|
||||
charset.newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT)));
|
||||
|
||||
lines = new ArrayList<String>();
|
||||
for (String word=null; (word=input.readLine())!=null;) {
|
||||
// skip initial bom marker
|
||||
if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
|
||||
word = word.substring(1);
|
||||
// skip comments
|
||||
if (word.startsWith("#")) continue;
|
||||
word=word.trim();
|
||||
// skip blank lines
|
||||
if (word.length()==0) continue;
|
||||
lines.add(word);
|
||||
}
|
||||
return WordlistLoader.getLines(openResource(resource), charset);
|
||||
} catch (CharacterCodingException ex) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
|
||||
"Error loading resource (wrong encoding?): " + resource, ex);
|
||||
} finally {
|
||||
if (input != null)
|
||||
input.close();
|
||||
"Error loading resource (wrong encoding?): " + resource, ex);
|
||||
}
|
||||
return lines;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -55,10 +55,6 @@ public class TestMultiWordSynonyms extends BaseTokenStreamTestCase {
|
|||
this.text = text;
|
||||
}
|
||||
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -32,10 +32,6 @@ class StringMockSolrResourceLoader implements ResourceLoader {
|
|||
this.text = text;
|
||||
}
|
||||
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
return Arrays.asList(text.split("\n"));
|
||||
}
|
||||
|
||||
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
|
||||
return null;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue