mirror of https://github.com/apache/lucene.git
remove "s" and "t" as stopwords because they make searching less precise, e.g. "t-online" gives the same results as "online" with "t" being a stopword
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@428998 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a9a325a4df
commit
1d65537c75
|
@ -4,6 +4,15 @@ $Id$
|
||||||
|
|
||||||
Trunk (not yet released)
|
Trunk (not yet released)
|
||||||
|
|
||||||
|
Changes in runtime behavior
|
||||||
|
|
||||||
|
1. 's' and 't' have been removed from the list of default stopwords
|
||||||
|
in StopAnalyzer (also used by StandardAnalyzer). Having e.g. 's'
|
||||||
|
as a stopword meant that 's-class' led to the same results as 'class'.
|
||||||
|
Note that this problem still exists for 'a', e.g. in 'a-class' as
|
||||||
|
'a' continues to be a stopword.
|
||||||
|
(Daniel Naber)
|
||||||
|
|
||||||
New features
|
New features
|
||||||
|
|
||||||
1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers
|
1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers
|
||||||
|
|
|
@ -31,8 +31,8 @@ public final class StopAnalyzer extends Analyzer {
|
||||||
public static final String[] ENGLISH_STOP_WORDS = {
|
public static final String[] ENGLISH_STOP_WORDS = {
|
||||||
"a", "an", "and", "are", "as", "at", "be", "but", "by",
|
"a", "an", "and", "are", "as", "at", "be", "but", "by",
|
||||||
"for", "if", "in", "into", "is", "it",
|
"for", "if", "in", "into", "is", "it",
|
||||||
"no", "not", "of", "on", "or", "s", "such",
|
"no", "not", "of", "on", "or", "such",
|
||||||
"t", "that", "the", "their", "then", "there", "these",
|
"that", "the", "their", "then", "there", "these",
|
||||||
"they", "this", "to", "was", "will", "with"
|
"they", "this", "to", "was", "will", "with"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -55,8 +55,18 @@ public class TestStandardAnalyzer extends TestCase {
|
||||||
// possessives are actually removed by StandardFilter, not the tokenizer
|
// possessives are actually removed by StandardFilter, not the tokenizer
|
||||||
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
|
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
|
||||||
assertAnalyzesTo(a, "you're", new String[]{"you're"});
|
assertAnalyzesTo(a, "you're", new String[]{"you're"});
|
||||||
|
assertAnalyzesTo(a, "she's", new String[]{"she"});
|
||||||
|
assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
|
||||||
|
assertAnalyzesTo(a, "don't", new String[]{"don't"});
|
||||||
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
|
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
|
||||||
|
|
||||||
|
// t and s were stopwords in Lucene <= 2.0, which made it impossible
|
||||||
|
// to correctly search for these terms:
|
||||||
|
assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
|
||||||
|
assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
|
||||||
|
// 'a' is still a stopword:
|
||||||
|
assertAnalyzesTo(a, "a-class", new String[]{"class"});
|
||||||
|
|
||||||
// company names
|
// company names
|
||||||
assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
|
assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
|
||||||
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
|
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
|
||||||
|
|
Loading…
Reference in New Issue