From bf73f1f28b408c8a4011a08fdd9de2e465d5f952 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 30 Jul 2012 14:03:27 +0000 Subject: [PATCH] LUCENE-3884: Move ElisionFilter out of .fr package git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1367096 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 ++ .../lucene/analysis/ca/CatalanAnalyzer.java | 4 +- .../lucene/analysis/fr/FrenchAnalyzer.java | 9 +++- .../lucene/analysis/ga/IrishAnalyzer.java | 4 +- .../lucene/analysis/it/ItalianAnalyzer.java | 4 +- .../analysis/{fr => util}/ElisionFilter.java | 44 +++++-------------- .../{fr => util}/ElisionFilterFactory.java | 12 ++--- ...he.lucene.analysis.util.TokenFilterFactory | 2 +- .../analysis/{fr => util}/TestElision.java | 7 +-- .../TestElisionFilterFactory.java | 2 +- .../analysis/{fr => util}/frenchArticles.txt | 0 11 files changed, 41 insertions(+), 50 deletions(-) rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{fr => util}/ElisionFilter.java (59%) rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{fr => util}/ElisionFilterFactory.java (88%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{fr => util}/TestElision.java (92%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{fr => util}/TestElisionFilterFactory.java (98%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{fr => util}/frenchArticles.txt (100%) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1f73231b55a..66a1f7fa01e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -69,6 +69,9 @@ API Changes * LUCENE-3747: Support Unicode 6.1.0. (Steve Rowe) +* LUCENE-3884: Moved ElisionFilter out of org.apache.lucene.analysis.fr + package into org.apache.lucene.analysis.util. (Robert Muir) + Optimizations * LUCENE-4171: Performance improvements to Packed64. diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java index 2dc4e695389..6101ab8e77e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java @@ -24,7 +24,6 @@ import java.util.Arrays; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; -import org.apache.lucene.analysis.fr.ElisionFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -32,6 +31,7 @@ import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ElisionFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.CatalanStemmer; @@ -127,7 +127,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase { Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); - result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES); + result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if(!stemExclusionSet.isEmpty()) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java index 778e750f8ca..9a6016bce1c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ElisionFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.util.IOUtils; @@ -35,6 +36,7 @@ import org.apache.lucene.util.Version; import java.io.IOException; import java.io.Reader; +import java.util.Arrays; /** * {@link Analyzer} for French language. @@ -54,6 +56,11 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase { /** File containing default French stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt"; + /** Default set of articles for ElisionFilter */ + public static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet( + new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList( + "l", "m", "t", "qu", "n", "s", "j"), true)); + /** * Contains words that should be indexed but not stemmed. */ @@ -134,7 +141,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase { Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); - result = new ElisionFilter(matchVersion, result); + result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if(!excltable.isEmpty()) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java index d9db3bed209..f716cdb28a7 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java @@ -23,7 +23,6 @@ import java.util.Arrays; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.StopFilter; -import org.apache.lucene.analysis.fr.ElisionFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -31,6 +30,7 @@ import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ElisionFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.IrishStemmer; @@ -140,7 +140,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase { StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS); s.setEnablePositionIncrements(false); result = s; - result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES); + result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new IrishLowerCaseFilter(result); result = new StopFilter(matchVersion, result, stopwords); if(!stemExclusionSet.isEmpty()) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java index c6aedf2309b..086d7bcbcb2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java @@ -24,7 +24,6 @@ import java.util.Arrays; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; -import org.apache.lucene.analysis.fr.ElisionFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -32,6 +31,7 @@ import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ElisionFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.util.IOUtils; @@ -129,7 +129,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); - result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES); + result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if(!stemExclusionSet.isEmpty()) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilter.java similarity index 59% rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilter.java index a05a8f9dc25..c04d28760ad 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilter.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.fr; +package org.apache.lucene.analysis.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -18,13 +18,11 @@ package org.apache.lucene.analysis.fr; */ import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.util.Version; /** * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be @@ -33,31 +31,17 @@ import org.apache.lucene.util.Version; * @see Elision in Wikipedia */ public final class ElisionFilter extends TokenFilter { - private CharArraySet articles = CharArraySet.EMPTY_SET; + private final CharArraySet articles; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet( - new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList( - "l", "m", "t", "qu", "n", "s", "j"), true)); - private static char[] apostrophes = {'\'', '\u2019'}; - - /** - * Constructs an elision filter with standard stop words - */ - public ElisionFilter(Version matchVersion, TokenStream input) { - this(matchVersion, input, DEFAULT_ARTICLES); - } - /** * Constructs an elision filter with a Set of stop words - * @param matchVersion the lucene backwards compatibility version * @param input the source {@link TokenStream} * @param articles a set of stopword articles */ - public ElisionFilter(Version matchVersion, TokenStream input, CharArraySet articles) { + public ElisionFilter(TokenStream input, CharArraySet articles) { super(input); - this.articles = CharArraySet.unmodifiableSet( - new CharArraySet(matchVersion, articles, true)); + this.articles = articles; } /** @@ -69,22 +53,18 @@ public final class ElisionFilter extends TokenFilter { char[] termBuffer = termAtt.buffer(); int termLength = termAtt.length(); - int minPoz = Integer.MAX_VALUE; - for (int i = 0; i < apostrophes.length; i++) { - char apos = apostrophes[i]; - // The equivalent of String.indexOf(ch) - for (int poz = 0; poz < termLength ; poz++) { - if (termBuffer[poz] == apos) { - minPoz = Math.min(poz, minPoz); - break; - } + int index = -1; + for (int i = 0; i < termLength; i++) { + char ch = termBuffer[i]; + if (ch == '\'' || ch == '\u2019') { + index = i; + break; } } // An apostrophe has been found. If the prefix is an article strip it off. - if (minPoz != Integer.MAX_VALUE - && articles.contains(termAtt.buffer(), 0, minPoz)) { - termAtt.copyBuffer(termAtt.buffer(), minPoz + 1, termAtt.length() - (minPoz + 1)); + if (index >= 0 && articles.contains(termBuffer, 0, index)) { + termAtt.copyBuffer(termBuffer, index + 1, termLength - (index + 1)); } return true; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java similarity index 88% rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java index 91f9f3a5494..aec6687b6bc 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ElisionFilterFactory.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.fr; +package org.apache.lucene.analysis.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,10 +17,9 @@ package org.apache.lucene.analysis.fr; * limitations under the License. */ -import org.apache.lucene.analysis.util.*; - import java.io.IOException; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fr.FrenchAnalyzer; /** * Factory for {@link ElisionFilter}. @@ -46,12 +45,13 @@ public class ElisionFilterFactory extends TokenFilterFactory implements Resource if (articlesFile != null) { articles = getWordSet(loader, articlesFile, ignoreCase); } + if (articles == null) { + articles = FrenchAnalyzer.DEFAULT_ARTICLES; + } } public ElisionFilter create(TokenStream input) { - assureMatchVersion(); - return articles == null ? new ElisionFilter(luceneMatchVersion,input) : - new ElisionFilter(luceneMatchVersion,input,articles); + return new ElisionFilter(input, articles); } } diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 0abe4b981ee..f790d02d3e4 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -40,7 +40,6 @@ org.apache.lucene.analysis.en.PorterStemFilterFactory org.apache.lucene.analysis.es.SpanishLightStemFilterFactory org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory -org.apache.lucene.analysis.fr.ElisionFilterFactory org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory org.apache.lucene.analysis.ga.IrishLowerCaseFilterFactory @@ -88,3 +87,4 @@ org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory org.apache.lucene.analysis.synonym.SynonymFilterFactory org.apache.lucene.analysis.th.ThaiWordFilterFactory org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory +org.apache.lucene.analysis.util.ElisionFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java similarity index 92% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java index aae0052d871..c48c86c47c8 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.fr; +package org.apache.lucene.analysis.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.fr.FrenchAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; @@ -41,7 +42,7 @@ public class TestElision extends BaseTokenStreamTestCase { String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin."; Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test)); CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false); - TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles); + TokenFilter filter = new ElisionFilter(tokenizer, articles); List tas = filter(filter); assertEquals("embrouille", tas.get(4)); assertEquals("O'brian", tas.get(6)); @@ -62,7 +63,7 @@ public class TestElision extends BaseTokenStreamTestCase { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new KeywordTokenizer(reader); - return new TokenStreamComponents(tokenizer, new ElisionFilter(TEST_VERSION_CURRENT, tokenizer)); + return new TokenStreamComponents(tokenizer, new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES)); } }; checkOneTermReuse(a, "", ""); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElisionFilterFactory.java similarity index 98% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElisionFilterFactory.java index daef999b67e..dbdc621a6e1 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElisionFilterFactory.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.fr; +package org.apache.lucene.analysis.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/frenchArticles.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/frenchArticles.txt similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/frenchArticles.txt rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/util/frenchArticles.txt