LUCENE-3884: Move ElisionFilter out of .fr package

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1367096 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-07-30 14:03:27 +00:00
parent e101167585
commit bf73f1f28b
11 changed files with 41 additions and 50 deletions

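For direct users of the filter, the package move also changes the constructor: the Version argument and the implicit default article set are gone, so callers now pass the articles explicitly. A minimal before/after sketch of the migration — the class name and the LUCENE_40 constant are illustrative assumptions, not part of this commit:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.ElisionFilter; // was org.apache.lucene.analysis.fr
import org.apache.lucene.util.Version;

public class ElisionMigration {
  public static void main(String[] args) {
    Tokenizer tok = new StandardTokenizer(Version.LUCENE_40, new StringReader("l'avion"));
    // Before: new ElisionFilter(matchVersion, tok) fell back to a private default set.
    // After: the article set is an explicit, required argument; the French
    // defaults now live on FrenchAnalyzer.DEFAULT_ARTICLES (see below).
    TokenStream ts = new ElisionFilter(tok, FrenchAnalyzer.DEFAULT_ARTICLES);
  }
}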
View File

@@ -69,6 +69,9 @@ API Changes
* LUCENE-3747: Support Unicode 6.1.0. (Steve Rowe)
* LUCENE-3884: Moved ElisionFilter out of org.apache.lucene.analysis.fr
package into org.apache.lucene.analysis.util. (Robert Muir)
Optimizations
* LUCENE-4171: Performance improvements to Packed64.

View File

@@ -24,7 +24,6 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -32,6 +31,7 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.CatalanStemmer;
@@ -127,7 +127,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
@@ -35,6 +36,7 @@ import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
/**
* {@link Analyzer} for French language.
@@ -54,6 +56,11 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
/** File containing default French stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
/** Default set of articles for ElisionFilter */
public static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
"l", "m", "t", "qu", "n", "s", "j"), true));
/**
* Contains words that should be indexed but not stemmed.
*/
@@ -134,7 +141,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new ElisionFilter(matchVersion, result);
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!excltable.isEmpty())

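With DEFAULT_ARTICLES promoted to a public constant on FrenchAnalyzer, custom chains can reuse the French defaults directly. A hedged sketch of running the new chain end to end — the demo class name and the LUCENE_40 constant are assumptions for illustration:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.util.Version;

public class FrenchElisionDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer tok = new StandardTokenizer(Version.LUCENE_40, new StringReader("l'avion"));
    TokenStream ts = new ElisionFilter(tok, FrenchAnalyzer.DEFAULT_ARTICLES);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // prints "avion"
    }
    ts.end();
    ts.close();
  }
}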
View File

@@ -23,7 +23,6 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -31,6 +30,7 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.IrishStemmer;
@@ -140,7 +140,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
s.setEnablePositionIncrements(false);
result = s;
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new IrishLowerCaseFilter(result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@@ -24,7 +24,6 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -32,6 +31,7 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
@@ -129,7 +129,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())

View File

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.fr;
package org.apache.lucene.analysis.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -18,13 +18,11 @@ package org.apache.lucene.analysis.fr;
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
* Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
@@ -33,31 +31,17 @@ import org.apache.lucene.util.Version;
* @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
*/
public final class ElisionFilter extends TokenFilter {
private CharArraySet articles = CharArraySet.EMPTY_SET;
private final CharArraySet articles;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
"l", "m", "t", "qu", "n", "s", "j"), true));
private static char[] apostrophes = {'\'', '\u2019'};
/**
* Constructs an elision filter with standard stop words
*/
public ElisionFilter(Version matchVersion, TokenStream input) {
this(matchVersion, input, DEFAULT_ARTICLES);
}
/**
* Constructs an elision filter with a Set of stop words
* @param matchVersion the lucene backwards compatibility version
* @param input the source {@link TokenStream}
* @param articles a set of stopword articles
*/
public ElisionFilter(Version matchVersion, TokenStream input, CharArraySet articles) {
public ElisionFilter(TokenStream input, CharArraySet articles) {
super(input);
this.articles = CharArraySet.unmodifiableSet(
new CharArraySet(matchVersion, articles, true));
this.articles = articles;
}
/**
@@ -69,22 +53,18 @@ public final class ElisionFilter extends TokenFilter {
char[] termBuffer = termAtt.buffer();
int termLength = termAtt.length();
int minPoz = Integer.MAX_VALUE;
for (int i = 0; i < apostrophes.length; i++) {
char apos = apostrophes[i];
// The equivalent of String.indexOf(ch)
for (int poz = 0; poz < termLength ; poz++) {
if (termBuffer[poz] == apos) {
minPoz = Math.min(poz, minPoz);
break;
}
int index = -1;
for (int i = 0; i < termLength; i++) {
char ch = termBuffer[i];
if (ch == '\'' || ch == '\u2019') {
index = i;
break;
}
}
// An apostrophe has been found. If the prefix is an article strip it off.
if (minPoz != Integer.MAX_VALUE
&& articles.contains(termAtt.buffer(), 0, minPoz)) {
termAtt.copyBuffer(termAtt.buffer(), minPoz + 1, termAtt.length() - (minPoz + 1));
if (index >= 0 && articles.contains(termBuffer, 0, index)) {
termAtt.copyBuffer(termBuffer, index + 1, termLength - (index + 1));
}
return true;

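The rewrite collapses the old two-loop scan (one pass per apostrophe character, keeping the minimum position) into a single left-to-right pass; the first ' or ’ encountered is necessarily the minimum position, so behavior is unchanged. The same logic restated on a plain String, as a standalone sketch using java.util collections in place of Lucene's CharArraySet:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class ElisionScanSketch {
  static String stripElision(String term, Set<String> articles) {
    // Single pass: find the first apostrophe (ASCII ' or U+2019).
    int index = -1;
    for (int i = 0; i < term.length(); i++) {
      char ch = term.charAt(i);
      if (ch == '\'' || ch == '\u2019') {
        index = i;
        break;
      }
    }
    // Strip the prefix only when it is a known article.
    if (index >= 0 && articles.contains(term.substring(0, index))) {
      return term.substring(index + 1);
    }
    return term;
  }

  public static void main(String[] args) {
    Set<String> articles = new HashSet<String>(
        Arrays.asList("l", "m", "t", "qu", "n", "s", "j"));
    System.out.println(stripElision("l'avion", articles)); // avion
    System.out.println(stripElision("O'brian", articles)); // O'brian (prefix is not an article)
  }
}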
View File

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.fr;
package org.apache.lucene.analysis.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,10 +17,9 @@ package org.apache.lucene.analysis.fr;
* limitations under the License.
*/
import org.apache.lucene.analysis.util.*;
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
/**
* Factory for {@link ElisionFilter}.
@@ -46,12 +45,13 @@ public class ElisionFilterFactory extends TokenFilterFactory implements Resource
if (articlesFile != null) {
articles = getWordSet(loader, articlesFile, ignoreCase);
}
if (articles == null) {
articles = FrenchAnalyzer.DEFAULT_ARTICLES;
}
}
public ElisionFilter create(TokenStream input) {
assureMatchVersion();
return articles == null ? new ElisionFilter(luceneMatchVersion,input) :
new ElisionFilter(luceneMatchVersion,input,articles);
return new ElisionFilter(input, articles);
}
}

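Note where the default-articles fallback moves: inform() now resolves FrenchAnalyzer.DEFAULT_ARTICLES up front when no articles file is configured, so create() no longer branches on a null set or needs assureMatchVersion(). A sketch of the factory lifecycle, assuming the 4.x-era init(Map)/inform(ResourceLoader) factory API and the ClasspathResourceLoader helper:

import java.io.StringReader;
import java.util.HashMap;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ElisionFilterFactory;
import org.apache.lucene.util.Version;

public class ElisionFactoryDemo {
  public static void main(String[] args) throws Exception {
    ElisionFilterFactory factory = new ElisionFilterFactory();
    factory.init(new HashMap<String, String>());   // no "articles" argument given
    factory.inform(new ClasspathResourceLoader()); // falls back to FrenchAnalyzer.DEFAULT_ARTICLES
    Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("l'avion"));
    TokenStream ts = factory.create(tok);          // no match version required anymore
  }
}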
View File

@@ -40,7 +40,6 @@ org.apache.lucene.analysis.en.PorterStemFilterFactory
org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
org.apache.lucene.analysis.fr.ElisionFilterFactory
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory
org.apache.lucene.analysis.ga.IrishLowerCaseFilterFactory
@@ -88,3 +87,4 @@ org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
org.apache.lucene.analysis.synonym.SynonymFilterFactory
org.apache.lucene.analysis.th.ThaiWordFilterFactory
org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory
org.apache.lucene.analysis.util.ElisionFilterFactory

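Because factories are discovered by simple name through this META-INF/services registration, the package move only changes which line carries the class; name-based lookup keeps working. A sketch, assuming the 4.x SPI lookup method and that the name derives from the class name (ElisionFilterFactory → "elision"):

import org.apache.lucene.analysis.util.TokenFilterFactory;

public class ElisionSpiLookup {
  public static void main(String[] args) {
    // Resolves through the services file above; the name now maps to
    // org.apache.lucene.analysis.util.ElisionFilterFactory.
    Class<? extends TokenFilterFactory> clazz = TokenFilterFactory.lookupClass("elision");
    System.out.println(clazz.getName());
  }
}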
View File

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.fr;
package org.apache.lucene.analysis.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@@ -41,7 +42,7 @@ public class TestElision extends BaseTokenStreamTestCase {
String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false);
TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles);
TokenFilter filter = new ElisionFilter(tokenizer, articles);
List<String> tas = filter(filter);
assertEquals("embrouille", tas.get(4));
assertEquals("O'brian", tas.get(6));
@@ -62,7 +63,7 @@
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new ElisionFilter(TEST_VERSION_CURRENT, tokenizer));
return new TokenStreamComponents(tokenizer, new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES));
}
};
checkOneTermReuse(a, "", "");

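Tests migrate the same way: the filter is built without a Version, and the former built-in defaults come from FrenchAnalyzer. A minimal extra assertion in the BaseTokenStreamTestCase style — the class name is illustrative, assertTokenStreamContents is the test framework's standard helper:

import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.ElisionFilter;

public class TestElisionSketch extends BaseTokenStreamTestCase {
  public void testStripsDefaultArticle() throws Exception {
    Tokenizer tok = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader("l'avion"));
    TokenFilter filter = new ElisionFilter(tok, FrenchAnalyzer.DEFAULT_ARTICLES);
    assertTokenStreamContents(filter, new String[] { "avion" });
  }
}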
View File

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.fr;
package org.apache.lucene.analysis.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more