LUCENE-7318: Forward port some changes (add StopFilter and LowerCaseFilter at their original location)

This commit is contained in:
Uwe Schindler 2016-09-12 19:47:28 +02:00
parent 86e4af60f0
commit b39fcc1202
13 changed files with 116 additions and 16 deletions

View File

@ -56,6 +56,11 @@ Bug Fixes
* LUCENE-7442: MinHashFilter's ctor should validate its args.
(Cao Manh Dat via Steve Rowe)
* LUCENE-7318: Fix backwards compatibility issues around StandardAnalyzer
and its components, introduced with Lucene 6.2.0. The moved classes
were restored in their original packages: LowerCaseFilter and StopFilter,
as well as several utility classes. (Uwe Schindler, Mike McCandless)
Improvements
Optimizations

View File

@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.TokenStream;
/**
* Normalizes token text to lower case.
* <p>
* This class moved to Lucene Core, but a reference in the {@code analysis/common} module
* is preserved for documentation purposes and consistency with filter factory.
* This subclass declares no fields or methods of its own: it only exposes a constructor
* that delegates to {@link org.apache.lucene.analysis.LowerCaseFilter}, so all filtering
* behavior is inherited from the Lucene Core class.
*
* @see org.apache.lucene.analysis.LowerCaseFilter
* @see LowerCaseFilterFactory
*/
public final class LowerCaseFilter extends org.apache.lucene.analysis.LowerCaseFilter {
/**
* Creates a new LowerCaseFilter that normalizes token text to lower case.
* The argument is passed unchanged to the superclass constructor.
*
* @param in TokenStream to filter
*/
public LowerCaseFilter(TokenStream in) {
// No additional state here; the Lucene Core superclass does all the work.
super(in);
}
}

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.core;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;

View File

@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
/**
* Removes stop words from a token stream.
* <p>
* This class moved to Lucene Core, but a reference in the {@code analysis/common} module
* is preserved for documentation purposes and consistency with filter factory.
* This subclass declares no fields or methods of its own: it only exposes a constructor
* that delegates to {@link org.apache.lucene.analysis.StopFilter}, so all filtering
* behavior is inherited from the Lucene Core class.
*
* @see org.apache.lucene.analysis.StopFilter
* @see StopFilterFactory
*/
public final class StopFilter extends org.apache.lucene.analysis.StopFilter {
/**
* Constructs a filter which removes words from the input TokenStream that are
* named in the Set. Both arguments are passed unchanged to the superclass
* constructor.
*
* @param in
* Input stream
* @param stopWords
* A {@link CharArraySet} representing the stopwords.
* @see #makeStopSet(java.lang.String...)
*/
public StopFilter(TokenStream in, CharArraySet stopWords) {
// No additional state here; the Lucene Core superclass does all the work.
super(in, stopWords);
}
}

View File

@ -21,7 +21,6 @@ import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader; // jdocs
import org.apache.lucene.analysis.util.ResourceLoader;

View File

@ -46,5 +46,9 @@
and {@link org.apache.lucene.analysis.StopFilter StopFilter}.
</li>
</ul>
<p>
This Java package additionally contains {@code StandardAnalyzer}, {@code StandardTokenizer},
and {@code StandardFilter}, which are not visible here, because they moved to Lucene Core.
The factories for those components (e.g., used in Solr) are still part of this module.
</p>
</body>
</html>

View File

@ -106,7 +106,9 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
SnowballFilter.class, // this is called SnowballPorterFilterFactory
PatternKeywordMarkerFilter.class,
SetKeywordMarkerFilter.class,
UnicodeWhitespaceTokenizer.class // a supported option via WhitespaceTokenizerFactory
UnicodeWhitespaceTokenizer.class, // a supported option via WhitespaceTokenizerFactory
org.apache.lucene.analysis.StopFilter.class, // class from core, but StopFilterFactory creates one from this module
org.apache.lucene.analysis.LowerCaseFilter.class // class from core, but LowerCaseFilterFactory creates one from this module
);
}

View File

@ -166,7 +166,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
// also randomly pick it:
ValidatingTokenFilter.class,
// TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or similar following will then cause pain)
WordDelimiterFilter.class)) {
WordDelimiterFilter.class,
// clones of core's filters:
org.apache.lucene.analysis.core.StopFilter.class,
org.apache.lucene.analysis.core.LowerCaseFilter.class)) {
for (Constructor<?> ctor : c.getConstructors()) {
brokenConstructors.put(ctor, ALWAYS);
}

View File

@ -27,7 +27,7 @@ import org.apache.lucene.analysis.CharacterUtils;
/**
* Normalizes token text to lower case.
*/
public final class LowerCaseFilter extends TokenFilter {
public class LowerCaseFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**

View File

@ -28,7 +28,7 @@ import org.apache.lucene.analysis.CharArraySet;
/**
* Removes stop words from a token stream.
*/
public final class StopFilter extends FilteringTokenFilter {
public class StopFilter extends FilteringTokenFilter {
private final CharArraySet stopWords;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

View File

@ -19,7 +19,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.NamedList;

View File

@ -278,11 +278,11 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
assertNotNull("Expecting the 'StandardFilter' to be applied on the query for the 'text' field", tokenList);
assertEquals("Query has only one token", 1, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1}, null, false));
tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.LowerCaseFilter");
tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the query for the 'text' field", tokenList);
assertEquals("Query has only one token", 1, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1,1}, null, false));
tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.StopFilter");
tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.StopFilter");
assertNotNull("Expecting the 'StopFilter' to be applied on the query for the 'text' field", tokenList);
assertEquals("Query has only one token", 1, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1,1,1}, null, false));
@ -311,7 +311,7 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4}, null, false));
assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, new int[]{5,5}, null, false));
assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6}, null, false));
tokenList = valueResult.get("org.apache.lucene.analysis.LowerCaseFilter");
tokenList = valueResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the index for the 'text' field", tokenList);
assertEquals("Expecting 6 tokens", 6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false));
@ -320,7 +320,7 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
assertToken(tokenList.get(3), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4}, null, false));
assertToken(tokenList.get(4), new TokenInfo("the", null, "<ALPHANUM>", 20, 23, 5, new int[]{5,5,5}, null, false));
assertToken(tokenList.get(5), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6}, null, false));
tokenList = valueResult.get("org.apache.lucene.analysis.StopFilter");
tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2}, null, false));

View File

@ -209,7 +209,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertToken(tokenList.get(7), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8}, null, false));
assertToken(tokenList.get(8), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9}, null, true));
assertToken(tokenList.get(9), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10}, null, false));
tokenList = indexPart.get("org.apache.lucene.analysis.LowerCaseFilter");
tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
assertEquals(tokenList.size(), 10);
assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false));
@ -222,7 +222,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertToken(tokenList.get(7), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8}, null, false));
assertToken(tokenList.get(8), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9}, null, true));
assertToken(tokenList.get(9), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10}, null, false));
tokenList = indexPart.get("org.apache.lucene.analysis.StopFilter");
tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter");
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
assertEquals(tokenList.size(), 8);
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2}, null, false));
@ -258,12 +258,12 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertEquals(2, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("brown", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2}, null, false));
tokenList = queryPart.get("org.apache.lucene.analysis.LowerCaseFilter");
tokenList = queryPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
assertEquals(2, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("brown", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2}, null, false));
tokenList = queryPart.get("org.apache.lucene.analysis.StopFilter");
tokenList = queryPart.get("org.apache.lucene.analysis.core.StopFilter");
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
assertEquals(2, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1,1}, null, false));
@ -416,7 +416,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("Test", null, "word", 14, 18, 5, new int[]{4,5}, null, false));
tokenList = indexPart.get("org.apache.lucene.analysis.LowerCaseFilter");
tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1,1}, null, false));