Lucene-10008: Respect ignoreCase flag in CommonGramsFilterFactory and factor out a common abstract base class AbstractWordsFileFilterFactory.java (#188)

This commit is contained in:
Vigya Sharma 2021-08-13 11:45:58 -07:00 committed by GitHub
parent 624560a3d7
commit cb4c8ae07f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 255 additions and 119 deletions

View File

@ -16,15 +16,12 @@
*/
package org.apache.lucene.analysis.commongrams;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
/**
* Constructs a {@link CommonGramsFilter}.
@ -40,26 +37,14 @@ import org.apache.lucene.util.ResourceLoaderAware;
* @since 3.1
* @lucene.spi {@value #NAME}
*/
public class CommonGramsFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public class CommonGramsFilterFactory extends AbstractWordsFileFilterFactory {
/** SPI name */
public static final String NAME = "commonGrams";
// TODO: shared base class for Stop/Keep/CommonGrams?
private CharArraySet commonWords;
private final String commonWordFiles;
private final String format;
private final boolean ignoreCase;
/** Creates a new CommonGramsFilterFactory */
public CommonGramsFilterFactory(Map<String, String> args) {
super(args);
commonWordFiles = get(args, "words");
format = get(args, "format");
ignoreCase = getBoolean(args, "ignoreCase", false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
/** Default ctor for compatibility with SPI */
@ -67,30 +52,18 @@ public class CommonGramsFilterFactory extends TokenFilterFactory implements Reso
throw defaultCtorException();
}
@Override
public void inform(ResourceLoader loader) throws IOException {
if (commonWordFiles != null) {
if ("snowball".equalsIgnoreCase(format)) {
commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
} else {
commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
}
} else {
commonWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
}
}
public boolean isIgnoreCase() {
return ignoreCase;
}
public CharArraySet getCommonWords() {
return commonWords;
return getWords();
}
@Override
protected CharArraySet createDefaultWords() {
return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase());
}
@Override
public TokenFilter create(TokenStream input) {
CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords);
CommonGramsFilter commonGrams = new CommonGramsFilter(input, getWords());
return commonGrams;
}
}

View File

@ -16,15 +16,12 @@
*/
package org.apache.lucene.analysis.core;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
/**
* Factory for {@link StopFilter}.
@ -65,28 +62,14 @@ import org.apache.lucene.util.ResourceLoaderAware;
* @since 3.1
* @lucene.spi {@value #NAME}
*/
public class StopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public class StopFilterFactory extends AbstractWordsFileFilterFactory {
/** SPI name */
public static final String NAME = "stop";
public static final String FORMAT_WORDSET = "wordset";
public static final String FORMAT_SNOWBALL = "snowball";
private CharArraySet stopWords;
private final String stopWordFiles;
private final String format;
private final boolean ignoreCase;
/** Creates a new StopFilterFactory */
public StopFilterFactory(Map<String, String> args) {
super(args);
stopWordFiles = get(args, "words");
format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
ignoreCase = getBoolean(args, "ignoreCase", false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
/** Default ctor for compatibility with SPI */
@ -94,37 +77,18 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
throw defaultCtorException();
}
@Override
public void inform(ResourceLoader loader) throws IOException {
if (stopWordFiles != null) {
if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
} else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
} else {
throw new IllegalArgumentException(
"Unknown 'format' specified for 'words' file: " + format);
}
} else {
if (null != format) {
throw new IllegalArgumentException(
"'format' can not be specified w/o an explicit 'words' file: " + format);
}
stopWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
}
}
public boolean isIgnoreCase() {
return ignoreCase;
}
public CharArraySet getStopWords() {
return stopWords;
return getWords();
}
@Override
protected CharArraySet createDefaultWords() {
return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase());
}
@Override
public TokenStream create(TokenStream input) {
StopFilter stopFilter = new StopFilter(input, stopWords);
StopFilter stopFilter = new StopFilter(input, getWords());
return stopFilter;
}
}

View File

@ -0,0 +1,122 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.en;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
/**
 * Base class for token filter factories that are configured with a word list, loaded either from
 * one or more word files or supplied by the concrete subclass.
 *
 * <p>The arguments below are recognised by this class. Every one of them is optional:
 *
 * <ul>
 *   <li><code>words</code> - name of the word file(s) to parse. When it is absent the factory
 *       falls back to whatever the subclass returns from {@link #createDefaultWords()}.
 *   <li><code>format</code> - how the <code>words</code> file is parsed. It defaults to <code>
 *       wordset</code> when <code>words</code> is given, and must not be given at all when
 *       <code>words</code> is absent.
 *   <li><code>ignoreCase</code> - defaults to <code>false</code>.
 * </ul>
 *
 * <p>Supported values for <code>format</code> (compared case-insensitively):
 *
 * <ul>
 *   <li><code>wordset</code> - the default. One word per line (intra-word whitespace is kept),
 *       whole-line comments start with the "#" character, and blank lines are skipped. See {@link
 *       WordlistLoader#getLines WordlistLoader.getLines} for details.
 *   <li><code>snowball</code> - several words may share a line, and a trailing comment starts at
 *       the vertical line ("&#124;"). Blank lines are skipped. See {@link
 *       WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet} for details.
 * </ul>
 *
 * <p>Any argument other than the three listed above makes the constructor fail, while an invalid
 * <code>format</code> is only reported once {@link #inform(ResourceLoader)} runs.
 */
public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory
    implements ResourceLoaderAware {

  /** Value of the <code>format</code> argument selecting the one-word-per-line parser. */
  public static final String FORMAT_WORDSET = "wordset";

  /** Value of the <code>format</code> argument selecting the snowball parser. */
  public static final String FORMAT_SNOWBALL = "snowball";

  // Assigned by inform(); remains null until then.
  private CharArraySet words;
  private final String wordFiles;
  private final String format;
  private final boolean ignoreCase;

  /** Default ctor for compatibility with SPI */
  protected AbstractWordsFileFilterFactory() {
    throw defaultCtorException();
  }

  /**
   * Reads <code>words</code>, <code>format</code> and <code>ignoreCase</code> from the given
   * arguments.
   *
   * @param args factory arguments; the entries understood by this class are consumed from it
   * @throws IllegalArgumentException if any entry is left in <code>args</code> afterwards
   */
  public AbstractWordsFileFilterFactory(Map<String, String> args) {
    super(args);
    wordFiles = get(args, "words");
    // A format is only implied when there is a file to apply it to.
    final String impliedFormat = (wordFiles == null) ? null : FORMAT_WORDSET;
    format = get(args, "format", impliedFormat);
    ignoreCase = getBoolean(args, "ignoreCase", false);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /**
   * Builds the word set: parsed from the configured word file(s) when <code>words</code> was
   * given, otherwise taken from {@link #createDefaultWords()}.
   *
   * @param loader used to resolve the configured word file(s)
   * @throws IOException declared for the word-file loading helpers
   * @throws IllegalArgumentException if <code>format</code> is not a supported value, or if it
   *     was given without <code>words</code>
   */
  @Override
  public void inform(ResourceLoader loader) throws IOException {
    if (wordFiles == null) {
      // No file configured: an explicit format makes no sense, so reject it before falling back.
      if (format != null) {
        throw new IllegalArgumentException(
            "'format' can not be specified w/o an explicit 'words' file: " + format);
      }
      words = createDefaultWords();
      return;
    }

    if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
      words = getWordSet(loader, wordFiles, ignoreCase);
    } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
      words = getSnowballWordSet(loader, wordFiles, ignoreCase);
    } else {
      throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
    }
  }

  /**
   * Supplies the word set used when no <code>words</code> file is configured. It is invoked from
   * {@link #inform(ResourceLoader)}, so {@link #isIgnoreCase()} already reflects the configured
   * value when this runs.
   *
   * @return the fallback word set; the result is stored as-is and returned by {@link #getWords()}
   */
  protected abstract CharArraySet createDefaultWords();

  /**
   * Returns the word set established by {@link #inform(ResourceLoader)}.
   *
   * @return the current word set, or <code>null</code> if none has been set
   */
  public CharArraySet getWords() {
    return words;
  }

  /**
   * Returns the raw value of the <code>words</code> argument.
   *
   * @return the configured word file name(s), or <code>null</code> if the argument was absent
   */
  public String getWordFiles() {
    return wordFiles;
  }

  /**
   * Returns the effective <code>format</code> value.
   *
   * @return the configured format, <code>wordset</code> if only <code>words</code> was given, or
   *     <code>null</code> if neither argument was given
   */
  public String getFormat() {
    return format;
  }

  /**
   * Returns the value of the <code>ignoreCase</code> argument.
   *
   * @return <code>true</code> if case-insensitive matching was requested
   */
  public boolean isIgnoreCase() {
    return ignoreCase;
  }
}

View File

@ -16,13 +16,10 @@
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
/**
* Factory for {@link KeepWordFilter}.
@ -38,23 +35,14 @@ import org.apache.lucene.util.ResourceLoaderAware;
* @since 3.1
* @lucene.spi {@value #NAME}
*/
public class KeepWordFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public class KeepWordFilterFactory extends AbstractWordsFileFilterFactory {
/** SPI name */
public static final String NAME = "keepWord";
private final boolean ignoreCase;
private final String wordFiles;
private CharArraySet words;
/** Creates a new KeepWordFilterFactory */
public KeepWordFilterFactory(Map<String, String> args) {
super(args);
wordFiles = get(args, "words");
ignoreCase = getBoolean(args, "ignoreCase", false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
/** Default ctor for compatibility with SPI */
@ -63,27 +51,17 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
}
@Override
public void inform(ResourceLoader loader) throws IOException {
if (wordFiles != null) {
words = getWordSet(loader, wordFiles, ignoreCase);
}
}
public boolean isIgnoreCase() {
return ignoreCase;
}
public CharArraySet getWords() {
return words;
protected CharArraySet createDefaultWords() {
return null;
}
@Override
public TokenStream create(TokenStream input) {
// if the set is null, it means it was empty
if (words == null) {
if (getWords() == null) {
return input;
} else {
final TokenStream filter = new KeepWordFilter(input, words);
final TokenStream filter = new KeepWordFilter(input, getWords());
return filter;
}
}

View File

@ -22,25 +22,25 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.TestStopFilterFactory;
import org.apache.lucene.util.ClasspathResourceLoader;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.Version;
/**
* Tests pretty much copied from StopFilterFactoryTest We use the test files used by the
* StopFilterFactoryTest TODO: consider creating separate test files so this won't break if stop
* filter test files change
*/
public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testInform() throws Exception {
ResourceLoader loader = new ClasspathResourceLoader(TestStopFilterFactory.class);
ResourceLoader loader = new ClasspathResourceLoader(getClass());
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsFilterFactory factory =
(CommonGramsFilterFactory)
tokenFilterFactory(
"CommonGrams", Version.LATEST, loader, "words", "stop-1.txt", "ignoreCase", "true");
"CommonGrams",
Version.LATEST,
loader,
"words",
"common-1.txt",
"ignoreCase",
"true");
CharArraySet words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
@ -53,7 +53,7 @@ public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase
Version.LATEST,
loader,
"words",
"stop-1.txt, stop-2.txt",
"common-1.txt, common-2.txt",
"ignoreCase",
"true");
words = factory.getCommonWords();
@ -68,7 +68,7 @@ public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase
Version.LATEST,
loader,
"words",
"stop-snowball.txt",
"common-snowball.txt",
"format",
"snowball",
"ignoreCase",
@ -98,6 +98,25 @@ public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase
stream, new String[] {"testing", "testing_the", "the", "the_factory", "factory"});
}
/**
* Test that ignoreCase flag is honored when no words are provided and default stopwords are used.
*/
public void testIgnoreCase() throws Exception {
ResourceLoader loader = new ClasspathResourceLoader(getClass());
CommonGramsFilterFactory factory =
(CommonGramsFilterFactory)
tokenFilterFactory("CommonGrams", Version.LATEST, loader, "ignoreCase", "true");
CharArraySet words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue(words.contains("the"));
assertTrue(words.contains("The"));
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("testing The factory"));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(
stream, new String[] {"testing", "testing_The", "The", "The_factory", "factory"});
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected =

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
foo
bar

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
junk
more

View File

@ -0,0 +1,10 @@
| This is a file in snowball format, empty lines are ignored, '|' is a comment
| Additionally, multiple words can be on the same line, allowing stopwords to be
| arranged in tables (useful in some languages where they might inflect)
| fictitious table below
|third person singular
|Subject Object Possessive Reflexive
he him his himself| masculine
she her hers herself| feminine

View File

@ -39,6 +39,32 @@ public class TestKeepFilterFactory extends BaseTokenStreamFactoryTestCase {
words = factory.getWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
factory =
(KeepWordFilterFactory)
tokenFilterFactory(
"KeepWord",
"words",
"keep-snowball.txt",
"format",
"snowball",
"ignoreCase",
"true");
words = factory.getWords();
assertEquals(8, words.size());
assertTrue(words.contains("he"));
assertTrue(words.contains("him"));
assertTrue(words.contains("his"));
assertTrue(words.contains("himself"));
assertTrue(words.contains("she"));
assertTrue(words.contains("her"));
assertTrue(words.contains("hers"));
assertTrue(words.contains("herself"));
// defaults
factory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord");
assertTrue(factory.getWords() == null);
assertEquals(false, factory.isIgnoreCase());
}
/** Test that bogus arguments result in exception */

View File

@ -0,0 +1,10 @@
| This is a file in snowball format, empty lines are ignored, '|' is a comment
| Additionally, multiple words can be on the same line, allowing stopwords to be
| arranged in tables (useful in some languages where they might inflect)
| fictitious table below
|third person singular
|Subject Object Possessive Reflexive
he him his himself| masculine
she her hers herself| feminine