remove "s" and "t" as stopwords because they make searching less precise, e.g. "t-online" gives the same results as "online" with "t" being a stopword

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@428998 13f79535-47bb-0310-9956-ffa450edef68
2006-08-05 13:11:09 +00:00 · 2006-08-05 13:11:09 +00:00 · 1d65537c75
parent a9a325a4df
commit 1d65537c75
3 changed files with 21 additions and 2 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -4,6 +4,15 @@ $Id$

 Trunk (not yet released)

+Changes in runtime behavior
+
+ 1. 's' and 't' have been removed from the list of default stopwords
+    in StopAnalyzer (also used by StandardAnalyzer). Having e.g. 's'
+    as a stopword meant that 's-class' led to the same results as 'class'.
+    Note that this problem still exists for 'a', e.g. in 'a-class' as
+    'a' continues to be a stopword.
+    (Daniel Naber)
+    
 New features

 1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers
--- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
@ -31,8 +31,8 @@ public final class StopAnalyzer extends Analyzer {
  public static final String[] ENGLISH_STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
-    "no", "not", "of", "on", "or", "s", "such",
-    "t", "that", "the", "their", "then", "there", "these",
+    "no", "not", "of", "on", "or", "such",
+    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

--- a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
+++ b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
@ -55,8 +55,18 @@ public class TestStandardAnalyzer extends TestCase {
    // possessives are actually removed by StardardFilter, not the tokenizer
    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
    assertAnalyzesTo(a, "you're", new String[]{"you're"});
+    assertAnalyzesTo(a, "she's", new String[]{"she"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
+    assertAnalyzesTo(a, "don't", new String[]{"don't"});
    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});

+    // t and s had been stopwords in Lucene <= 2.0, which made it impossible
+    // to correctly search for these terms:
+    assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
+    assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
+    // 'a' is still a stopword:
+    assertAnalyzesTo(a, "a-class", new String[]{"class"});
+
    // company names
    assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
    assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});