mirror of https://github.com/apache/lucene.git
remove "s" and "t" as stopwords because they make searching less precise, e.g. "t-online" gives the same results as "online" with "t" being a stopword
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@428998 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a9a325a4df
commit
1d65537c75
|
@ -4,6 +4,15 @@ $Id$
|
|||
|
||||
Trunk (not yet released)
|
||||
|
||||
Changes in runtime behavior
|
||||
|
||||
1. 's' and 't' have been removed from the list of default stopwords
|
||||
in StopAnalyzer (also used by StandardAnalyzer). Having e.g. 's'
|
||||
as a stopword meant that 's-class' led to the same results as 'class'.
|
||||
Note that this problem still exists for 'a', e.g. in 'a-class' as
|
||||
'a' continues to be a stopword.
|
||||
(Daniel Naber)
|
||||
|
||||
New features
|
||||
|
||||
1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers
|
||||
|
|
|
@ -31,8 +31,8 @@ public final class StopAnalyzer extends Analyzer {
|
|||
public static final String[] ENGLISH_STOP_WORDS = {
|
||||
"a", "an", "and", "are", "as", "at", "be", "but", "by",
|
||||
"for", "if", "in", "into", "is", "it",
|
||||
"no", "not", "of", "on", "or", "s", "such",
|
||||
"t", "that", "the", "their", "then", "there", "these",
|
||||
"no", "not", "of", "on", "or", "such",
|
||||
"that", "the", "their", "then", "there", "these",
|
||||
"they", "this", "to", "was", "will", "with"
|
||||
};
|
||||
|
||||
|
|
|
@ -55,8 +55,18 @@ public class TestStandardAnalyzer extends TestCase {
|
|||
// possessives are actually removed by StandardFilter, not the tokenizer
|
||||
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
|
||||
assertAnalyzesTo(a, "you're", new String[]{"you're"});
|
||||
assertAnalyzesTo(a, "she's", new String[]{"she"});
|
||||
assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
|
||||
assertAnalyzesTo(a, "don't", new String[]{"don't"});
|
||||
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
|
||||
|
||||
// 't' and 's' were stopwords in Lucene <= 2.0, which made it impossible
|
||||
// to correctly search for these terms:
|
||||
assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
|
||||
assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
|
||||
// 'a' is still a stopword:
|
||||
assertAnalyzesTo(a, "a-class", new String[]{"class"});
|
||||
|
||||
// company names
|
||||
assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
|
||||
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
|
||||
|
|
Loading…
Reference in New Issue