mirror of https://github.com/apache/lucene.git
LUCENE-3884: Move ElisionFilter out of .fr package
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1367096 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e101167585
commit
bf73f1f28b
lucene
CHANGES.txt
analysis/common/src
java/org/apache/lucene/analysis
ca
fr
ga
it
util
resources/META-INF/services
test/org/apache/lucene/analysis/util
|
@ -69,6 +69,9 @@ API Changes
|
|||
|
||||
* LUCENE-3747: Support Unicode 6.1.0. (Steve Rowe)
|
||||
|
||||
* LUCENE-3884: Moved ElisionFilter out of org.apache.lucene.analysis.fr
|
||||
package into org.apache.lucene.analysis.util. (Robert Muir)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-4171: Performance improvements to Packed64.
|
||||
|
|
|
@ -24,7 +24,6 @@ import java.util.Arrays;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.fr.ElisionFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
@ -32,6 +31,7 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
|
|||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.ElisionFilter;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.tartarus.snowball.ext.CatalanStemmer;
|
||||
|
@ -127,7 +127,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
|||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
|
||||
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
|
|||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.ElisionFilter;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -35,6 +36,7 @@ import org.apache.lucene.util.Version;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for French language.
|
||||
|
@ -54,6 +56,11 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
|||
/** File containing default French stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
|
||||
|
||||
/** Default set of articles for ElisionFilter */
|
||||
public static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
|
||||
new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
|
||||
"l", "m", "t", "qu", "n", "s", "j"), true));
|
||||
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
|
@ -134,7 +141,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
|||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new ElisionFilter(matchVersion, result);
|
||||
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!excltable.isEmpty())
|
||||
|
|
|
@ -23,7 +23,6 @@ import java.util.Arrays;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.fr.ElisionFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
@ -31,6 +30,7 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
|
|||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.ElisionFilter;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.tartarus.snowball.ext.IrishStemmer;
|
||||
|
@ -140,7 +140,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
|
|||
StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
|
||||
s.setEnablePositionIncrements(false);
|
||||
result = s;
|
||||
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
|
||||
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
||||
result = new IrishLowerCaseFilter(result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
|
|
|
@ -24,7 +24,6 @@ import java.util.Arrays;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.fr.ElisionFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
@ -32,6 +31,7 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
|
|||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.ElisionFilter;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -129,7 +129,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
|||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
|
||||
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.analysis.fr;
|
||||
package org.apache.lucene.analysis.util;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -18,13 +18,11 @@ package org.apache.lucene.analysis.fr;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
|
||||
|
@ -33,31 +31,17 @@ import org.apache.lucene.util.Version;
|
|||
* @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
|
||||
*/
|
||||
public final class ElisionFilter extends TokenFilter {
|
||||
private CharArraySet articles = CharArraySet.EMPTY_SET;
|
||||
private final CharArraySet articles;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
|
||||
new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
|
||||
"l", "m", "t", "qu", "n", "s", "j"), true));
|
||||
|
||||
private static char[] apostrophes = {'\'', '\u2019'};
|
||||
|
||||
/**
|
||||
* Constructs an elision filter with standard stop words
|
||||
*/
|
||||
public ElisionFilter(Version matchVersion, TokenStream input) {
|
||||
this(matchVersion, input, DEFAULT_ARTICLES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs an elision filter with a Set of stop words
|
||||
* @param matchVersion the lucene backwards compatibility version
|
||||
* @param input the source {@link TokenStream}
|
||||
* @param articles a set of stopword articles
|
||||
*/
|
||||
public ElisionFilter(Version matchVersion, TokenStream input, CharArraySet articles) {
|
||||
public ElisionFilter(TokenStream input, CharArraySet articles) {
|
||||
super(input);
|
||||
this.articles = CharArraySet.unmodifiableSet(
|
||||
new CharArraySet(matchVersion, articles, true));
|
||||
this.articles = articles;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -69,22 +53,18 @@ public final class ElisionFilter extends TokenFilter {
|
|||
char[] termBuffer = termAtt.buffer();
|
||||
int termLength = termAtt.length();
|
||||
|
||||
int minPoz = Integer.MAX_VALUE;
|
||||
for (int i = 0; i < apostrophes.length; i++) {
|
||||
char apos = apostrophes[i];
|
||||
// The equivalent of String.indexOf(ch)
|
||||
for (int poz = 0; poz < termLength ; poz++) {
|
||||
if (termBuffer[poz] == apos) {
|
||||
minPoz = Math.min(poz, minPoz);
|
||||
break;
|
||||
}
|
||||
int index = -1;
|
||||
for (int i = 0; i < termLength; i++) {
|
||||
char ch = termBuffer[i];
|
||||
if (ch == '\'' || ch == '\u2019') {
|
||||
index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// An apostrophe has been found. If the prefix is an article strip it off.
|
||||
if (minPoz != Integer.MAX_VALUE
|
||||
&& articles.contains(termAtt.buffer(), 0, minPoz)) {
|
||||
termAtt.copyBuffer(termAtt.buffer(), minPoz + 1, termAtt.length() - (minPoz + 1));
|
||||
if (index >= 0 && articles.contains(termBuffer, 0, index)) {
|
||||
termAtt.copyBuffer(termBuffer, index + 1, termLength - (index + 1));
|
||||
}
|
||||
|
||||
return true;
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.analysis.fr;
|
||||
package org.apache.lucene.analysis.util;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -17,10 +17,9 @@ package org.apache.lucene.analysis.fr;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.util.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
|
||||
|
||||
/**
|
||||
* Factory for {@link ElisionFilter}.
|
||||
|
@ -46,12 +45,13 @@ public class ElisionFilterFactory extends TokenFilterFactory implements Resource
|
|||
if (articlesFile != null) {
|
||||
articles = getWordSet(loader, articlesFile, ignoreCase);
|
||||
}
|
||||
if (articles == null) {
|
||||
articles = FrenchAnalyzer.DEFAULT_ARTICLES;
|
||||
}
|
||||
}
|
||||
|
||||
public ElisionFilter create(TokenStream input) {
|
||||
assureMatchVersion();
|
||||
return articles == null ? new ElisionFilter(luceneMatchVersion,input) :
|
||||
new ElisionFilter(luceneMatchVersion,input,articles);
|
||||
return new ElisionFilter(input, articles);
|
||||
}
|
||||
}
|
||||
|
|
@ -40,7 +40,6 @@ org.apache.lucene.analysis.en.PorterStemFilterFactory
|
|||
org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
|
||||
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
|
||||
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
|
||||
org.apache.lucene.analysis.fr.ElisionFilterFactory
|
||||
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
|
||||
org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory
|
||||
org.apache.lucene.analysis.ga.IrishLowerCaseFilterFactory
|
||||
|
@ -88,3 +87,4 @@ org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
|
|||
org.apache.lucene.analysis.synonym.SynonymFilterFactory
|
||||
org.apache.lucene.analysis.th.ThaiWordFilterFactory
|
||||
org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory
|
||||
org.apache.lucene.analysis.util.ElisionFilterFactory
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.analysis.fr;
|
||||
package org.apache.lucene.analysis.util;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -28,6 +28,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
@ -41,7 +42,7 @@ public class TestElision extends BaseTokenStreamTestCase {
|
|||
String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
|
||||
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
|
||||
CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false);
|
||||
TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles);
|
||||
TokenFilter filter = new ElisionFilter(tokenizer, articles);
|
||||
List<String> tas = filter(filter);
|
||||
assertEquals("embrouille", tas.get(4));
|
||||
assertEquals("O'brian", tas.get(6));
|
||||
|
@ -62,7 +63,7 @@ public class TestElision extends BaseTokenStreamTestCase {
|
|||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
||||
return new TokenStreamComponents(tokenizer, new ElisionFilter(TEST_VERSION_CURRENT, tokenizer));
|
||||
return new TokenStreamComponents(tokenizer, new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES));
|
||||
}
|
||||
};
|
||||
checkOneTermReuse(a, "", "");
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.analysis.fr;
|
||||
package org.apache.lucene.analysis.util;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
Loading…
Reference in New Issue