From 1d65537c75a1cdac9d17fb8c24786eb4ccf21b6c Mon Sep 17 00:00:00 2001 From: Daniel Naber Date: Sat, 5 Aug 2006 13:11:09 +0000 Subject: [PATCH] remove "s" and "t" as stopwords because they make searching less precise, e.g. "t-online" gives the same results as "online" with "t" being a stopword git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@428998 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 9 +++++++++ src/java/org/apache/lucene/analysis/StopAnalyzer.java | 4 ++-- .../apache/lucene/analysis/TestStandardAnalyzer.java | 10 ++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index cde27796fc2..a8b3588ef6c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -4,6 +4,15 @@ $Id$ Trunk (not yet released) +Changes in runtime behavior + + 1. 's' and 't' have been removed from the list of default stopwords + in StopAnalyzer (also used by StandardAnalyzer). Having e.g. 's' + as a stopword meant that 's-class' led to the same results as 'class'. + Note that this problem still exists for 'a', e.g. in 'a-class' as + 'a' continues to be a stopword. + (Daniel Naber) + New features 1. 
LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers diff --git a/src/java/org/apache/lucene/analysis/StopAnalyzer.java b/src/java/org/apache/lucene/analysis/StopAnalyzer.java index 77455108aae..e84b847645d 100644 --- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java @@ -31,8 +31,8 @@ public final class StopAnalyzer extends Analyzer { public static final String[] ENGLISH_STOP_WORDS = { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", - "no", "not", "of", "on", "or", "s", "such", - "t", "that", "the", "their", "then", "there", "these", + "no", "not", "of", "on", "or", "such", + "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" }; diff --git a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java index 1184c093948..496b073a684 100644 --- a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java +++ b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java @@ -55,8 +55,18 @@ public class TestStandardAnalyzer extends TestCase { // possessives are actually removed by StardardFilter, not the tokenizer assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"}); assertAnalyzesTo(a, "you're", new String[]{"you're"}); + assertAnalyzesTo(a, "she's", new String[]{"she"}); + assertAnalyzesTo(a, "Jim's", new String[]{"jim"}); + assertAnalyzesTo(a, "don't", new String[]{"don't"}); assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"}); + // t and s had been stopwords in Lucene <= 2.0, which made it impossible + // to correctly search for these terms: + assertAnalyzesTo(a, "s-class", new String[]{"s", "class"}); + assertAnalyzesTo(a, "t-com", new String[]{"t", "com"}); + // 'a' is still a stopword: + assertAnalyzesTo(a, "a-class", new String[]{"class"}); + // company names assertAnalyzesTo(a, "AT&T", new String[]{"at&t"}); 
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});