mirror of https://github.com/apache/lucene.git
LUCENE-7318: Forward port some changes (add StopFilter and LowerCaseFilter at their original location)
This commit is contained in:
parent
86e4af60f0
commit
b39fcc1202
|
@ -56,6 +56,11 @@ Bug Fixes
|
|||
* LUCENE-7442: MinHashFilter's ctor should validate its args.
|
||||
(Cao Manh Dat via Steve Rowe)
|
||||
|
||||
* LUCENE-7318: Fix backwards compatibility issues around StandardAnalyzer
|
||||
and its components, introduced with Lucene 6.2.0. The moved classes
|
||||
were restored in their original packages: LowerCaseFilter and StopFilter,
|
||||
as well as several utility classes. (Uwe Schindler, Mike McCandless)
|
||||
|
||||
Improvements
|
||||
|
||||
Optimizations
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.core;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Normalizes token text to lower case.
|
||||
* <p>
|
||||
* This class moved to Lucene Core, but a reference in the {@code analysis/common} module
|
||||
* is preserved for documentation purposes and consistency with the filter factory.
|
||||
* @see org.apache.lucene.analysis.LowerCaseFilter
|
||||
* @see LowerCaseFilterFactory
|
||||
*/
|
||||
public final class LowerCaseFilter extends org.apache.lucene.analysis.LowerCaseFilter {
|
||||
|
||||
/**
|
||||
* Create a new LowerCaseFilter, that normalizes token text to lower case.
|
||||
*
|
||||
* @param in TokenStream to filter
|
||||
*/
|
||||
public LowerCaseFilter(TokenStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
}
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.core;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.core;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Removes stop words from a token stream.
|
||||
* <p>
|
||||
* This class moved to Lucene Core, but a reference in the {@code analysis/common} module
|
||||
* is preserved for documentation purposes and consistency with the filter factory.
|
||||
* @see org.apache.lucene.analysis.StopFilter
|
||||
* @see StopFilterFactory
|
||||
*/
|
||||
public final class StopFilter extends org.apache.lucene.analysis.StopFilter {
|
||||
|
||||
/**
|
||||
* Constructs a filter which removes words from the input TokenStream that are
|
||||
* named in the Set.
|
||||
*
|
||||
* @param in
|
||||
* Input stream
|
||||
* @param stopWords
|
||||
* A {@link CharArraySet} representing the stopwords.
|
||||
* @see #makeStopSet(java.lang.String...)
|
||||
*/
|
||||
public StopFilter(TokenStream in, CharArraySet stopWords) {
|
||||
super(in, stopWords);
|
||||
}
|
||||
|
||||
}
|
|
@ -21,7 +21,6 @@ import java.io.IOException;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WordlistLoader; // jdocs
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
|
|
|
@ -46,5 +46,9 @@
|
|||
and {@link org.apache.lucene.analysis.StopFilter StopFilter}.
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
This Java package additionally contains {@code StandardAnalyzer}, {@code StandardTokenizer},
|
||||
and {@code StandardFilter}, which are not visible here, because they moved to Lucene Core.
|
||||
The factories for those components (e.g., used in Solr) are still part of this module.
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -106,7 +106,9 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
|
|||
SnowballFilter.class, // this is called SnowballPorterFilterFactory
|
||||
PatternKeywordMarkerFilter.class,
|
||||
SetKeywordMarkerFilter.class,
|
||||
UnicodeWhitespaceTokenizer.class // a supported option via WhitespaceTokenizerFactory
|
||||
UnicodeWhitespaceTokenizer.class, // a supported option via WhitespaceTokenizerFactory
|
||||
org.apache.lucene.analysis.StopFilter.class, // class from core, but StopFilterFactory creates one from this module
|
||||
org.apache.lucene.analysis.LowerCaseFilter.class // class from core, but LowerCaseFilterFactory creates one from this module
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
@ -166,7 +166,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
// also randomly pick it:
|
||||
ValidatingTokenFilter.class,
|
||||
// TODO: needs to be a tokenizer, doesn't handle graph inputs properly (a shingle or similar following will then cause pain)
|
||||
WordDelimiterFilter.class)) {
|
||||
WordDelimiterFilter.class,
|
||||
// clones of core's filters:
|
||||
org.apache.lucene.analysis.core.StopFilter.class,
|
||||
org.apache.lucene.analysis.core.LowerCaseFilter.class)) {
|
||||
for (Constructor<?> ctor : c.getConstructors()) {
|
||||
brokenConstructors.put(ctor, ALWAYS);
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ import org.apache.lucene.analysis.CharacterUtils;
|
|||
/**
|
||||
* Normalizes token text to lower case.
|
||||
*/
|
||||
public final class LowerCaseFilter extends TokenFilter {
|
||||
public class LowerCaseFilter extends TokenFilter {
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
/**
|
||||
|
|
|
@ -28,7 +28,7 @@ import org.apache.lucene.analysis.CharArraySet;
|
|||
/**
|
||||
* Removes stop words from a token stream.
|
||||
*/
|
||||
public final class StopFilter extends FilteringTokenFilter {
|
||||
public class StopFilter extends FilteringTokenFilter {
|
||||
|
||||
private final CharArraySet stopWords;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
|
|
@ -19,7 +19,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
|
|
|
@ -278,11 +278,11 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
|
|||
assertNotNull("Expecting the 'StandardFilter' to be applied on the query for the 'text' field", tokenList);
|
||||
assertEquals("Query has only one token", 1, tokenList.size());
|
||||
assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1}, null, false));
|
||||
tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.LowerCaseFilter");
|
||||
tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
|
||||
assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the query for the 'text' field", tokenList);
|
||||
assertEquals("Query has only one token", 1, tokenList.size());
|
||||
assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1,1}, null, false));
|
||||
tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.StopFilter");
|
||||
tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.StopFilter");
|
||||
assertNotNull("Expecting the 'StopFilter' to be applied on the query for the 'text' field", tokenList);
|
||||
assertEquals("Query has only one token", 1, tokenList.size());
|
||||
assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1,1,1}, null, false));
|
||||
|
@ -311,7 +311,7 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
|
|||
assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4}, null, false));
|
||||
assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, new int[]{5,5}, null, false));
|
||||
assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6}, null, false));
|
||||
tokenList = valueResult.get("org.apache.lucene.analysis.LowerCaseFilter");
|
||||
tokenList = valueResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
|
||||
assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the index for the 'text' field", tokenList);
|
||||
assertEquals("Expecting 6 tokens", 6, tokenList.size());
|
||||
assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false));
|
||||
|
@ -320,7 +320,7 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
|
|||
assertToken(tokenList.get(3), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4}, null, false));
|
||||
assertToken(tokenList.get(4), new TokenInfo("the", null, "<ALPHANUM>", 20, 23, 5, new int[]{5,5,5}, null, false));
|
||||
assertToken(tokenList.get(5), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6}, null, false));
|
||||
tokenList = valueResult.get("org.apache.lucene.analysis.StopFilter");
|
||||
tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
|
||||
assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
|
||||
assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
|
||||
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2}, null, false));
|
||||
|
|
|
@ -209,7 +209,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
|
|||
assertToken(tokenList.get(7), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8}, null, false));
|
||||
assertToken(tokenList.get(8), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9}, null, true));
|
||||
assertToken(tokenList.get(9), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10}, null, false));
|
||||
tokenList = indexPart.get("org.apache.lucene.analysis.LowerCaseFilter");
|
||||
tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
|
||||
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
|
||||
assertEquals(tokenList.size(), 10);
|
||||
assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false));
|
||||
|
@ -222,7 +222,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
|
|||
assertToken(tokenList.get(7), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8}, null, false));
|
||||
assertToken(tokenList.get(8), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9}, null, true));
|
||||
assertToken(tokenList.get(9), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10}, null, false));
|
||||
tokenList = indexPart.get("org.apache.lucene.analysis.StopFilter");
|
||||
tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter");
|
||||
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
|
||||
assertEquals(tokenList.size(), 8);
|
||||
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2}, null, false));
|
||||
|
@ -258,12 +258,12 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
|
|||
assertEquals(2, tokenList.size());
|
||||
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1}, null, false));
|
||||
assertToken(tokenList.get(1), new TokenInfo("brown", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2}, null, false));
|
||||
tokenList = queryPart.get("org.apache.lucene.analysis.LowerCaseFilter");
|
||||
tokenList = queryPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
|
||||
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
|
||||
assertEquals(2, tokenList.size());
|
||||
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false));
|
||||
assertToken(tokenList.get(1), new TokenInfo("brown", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2}, null, false));
|
||||
tokenList = queryPart.get("org.apache.lucene.analysis.StopFilter");
|
||||
tokenList = queryPart.get("org.apache.lucene.analysis.core.StopFilter");
|
||||
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
|
||||
assertEquals(2, tokenList.size());
|
||||
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1,1}, null, false));
|
||||
|
@ -416,7 +416,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
|
|||
assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
|
||||
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4}, null, false));
|
||||
assertToken(tokenList.get(5), new TokenInfo("Test", null, "word", 14, 18, 5, new int[]{4,5}, null, false));
|
||||
tokenList = indexPart.get("org.apache.lucene.analysis.LowerCaseFilter");
|
||||
tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
|
||||
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
|
||||
assertEquals(6, tokenList.size());
|
||||
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1,1}, null, false));
|
||||
|
|
Loading…
Reference in New Issue