From 1d65537c75a1cdac9d17fb8c24786eb4ccf21b6c Mon Sep 17 00:00:00 2001
From: Daniel Naber <dnaber@apache.org>
Date: Sat, 5 Aug 2006 13:11:09 +0000
Subject: [PATCH] remove "s" and "t" as stopwords because they make searching
 less precise, e.g. "t-online" gives the same results as "online" with "t"
 being a stopword

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@428998 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                            |  9 +++++++++
 src/java/org/apache/lucene/analysis/StopAnalyzer.java  |  4 ++--
 .../apache/lucene/analysis/TestStandardAnalyzer.java   | 10 ++++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index cde27796fc2..a8b3588ef6c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -4,6 +4,15 @@ $Id$
 
 Trunk (not yet released)
 
+Changes in runtime behavior
+
+ 1. 's' and 't' have been removed from the list of default stopwords
+    in StopAnalyzer (also used by StandardAnalyzer). Having e.g. 's'
+    as a stopword meant that 's-class' led to the same results as 'class'.
+    Note that this problem still exists for 'a', e.g. in 'a-class' as
+    'a' continues to be a stopword.
+    (Daniel Naber)
+    
 New features
 
  1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers
diff --git a/src/java/org/apache/lucene/analysis/StopAnalyzer.java b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
index 77455108aae..e84b847645d 100644
--- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
@@ -31,8 +31,8 @@ public final class StopAnalyzer extends Analyzer {
   public static final String[] ENGLISH_STOP_WORDS = {
     "a", "an", "and", "are", "as", "at", "be", "but", "by",
     "for", "if", "in", "into", "is", "it",
-    "no", "not", "of", "on", "or", "s", "such",
-    "t", "that", "the", "their", "then", "there", "these",
+    "no", "not", "of", "on", "or", "such",
+    "that", "the", "their", "then", "there", "these",
     "they", "this", "to", "was", "will", "with"
   };
 
diff --git a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
index 1184c093948..496b073a684 100644
--- a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
+++ b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
@@ -55,8 +55,18 @@ public class TestStandardAnalyzer extends TestCase {
     // possessives are actually removed by StardardFilter, not the tokenizer
     assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
     assertAnalyzesTo(a, "you're", new String[]{"you're"});
+    assertAnalyzesTo(a, "she's", new String[]{"she"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
+    assertAnalyzesTo(a, "don't", new String[]{"don't"});
     assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
 
+    // 't' and 's' were stopwords in Lucene <= 2.0, which made it impossible
+    // to search correctly for terms like these:
+    assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
+    assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
+    // 'a' is still a stopword:
+    assertAnalyzesTo(a, "a-class", new String[]{"class"});
+
     // company names
     assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
     assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});