diff --git a/CHANGES.txt b/CHANGES.txt index 9447a5ea8a6..7c9b7f0e093 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -5,6 +5,19 @@ $Id$ Changes in runtime behavior + 1. LUCENE-1151: Fix StandardAnalyzer to not mis-identify host names + (eg lucene.apache.org) as an ACRONYM. To get back to the pre-2.4 + backwards compatible, but buggy, behavior, you can either call + StandardAnalyzer.setDefaultReplaceInvalidAcronym(false) (static + method), or, set system property + org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym + to "false" on JVM startup. All StandardAnalyzer instances created + after that will then show the pre-2.4 behavior. Alternatively, + you can call setReplaceInvalidAcronym(false) to change the + behavior per instance of StandardAnalyzer. This backwards + compatibility will be removed in 3.0 (hardwiring the value to + true). (Mike McCandless) + API Changes 1. LUCENE-1084: Changed all IndexWriter constructors to take an diff --git a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java index 42daad14934..0ed1289d341 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java @@ -41,8 +41,49 @@ public class StandardAnalyzer extends Analyzer { * * See https://issues.apache.org/jira/browse/LUCENE-1068 */ - private boolean replaceInvalidAcronym = false; - + private boolean replaceInvalidAcronym = defaultReplaceInvalidAcronym; + + private static boolean defaultReplaceInvalidAcronym; + + // Default to false (fixed the bug), unless the system prop is set + static { + final String v = System.getProperty("org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym"); + if (v == null || v.equals("true")) + defaultReplaceInvalidAcronym = true; + else + defaultReplaceInvalidAcronym = false; + } + + /** + * + * @return true if new instances of StandardTokenizer will + * replace mischaracterized acronyms + * + * See https://issues.apache.org/jira/browse/LUCENE-1068 + * @deprecated This will be removed (hardwired to true) in 3.0 + */ + public static boolean getDefaultReplaceInvalidAcronym() { + return defaultReplaceInvalidAcronym; + } + + /** + * + * @param replaceInvalidAcronym Set to true to have new + * instances of StandardTokenizer replace mischaracterized + * acronyms by default. Set to false to preseve the + * previous (before 2.4) buggy behavior. Alternatively, + * set the system property + * org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym + * to false. + * + * See https://issues.apache.org/jira/browse/LUCENE-1068 + * @deprecated This will be removed (hardwired to true) in 3.0 + */ + public static void setDefaultReplaceInvalidAcronym(boolean replaceInvalidAcronym) { + defaultReplaceInvalidAcronym = replaceInvalidAcronym; + } + + /** An array containing some common English words that are usually not useful for searching. */ public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS; @@ -204,6 +245,7 @@ public class StandardAnalyzer extends Analyzer { * @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer * * See https://issues.apache.org/jira/browse/LUCENE-1068 + * @deprecated This will be removed (hardwired to true) in 3.0 */ public boolean isReplaceInvalidAcronym() { return replaceInvalidAcronym; @@ -214,6 +256,7 @@ public class StandardAnalyzer extends Analyzer { * @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer * * See https://issues.apache.org/jira/browse/LUCENE-1068 + * @deprecated This will be removed (hardwired to true) in 3.0 */ public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) { this.replaceInvalidAcronym = replaceInvalidAcronym; diff --git a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java index de43d745607..9f60e820c1d 100644 --- a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java +++ b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java @@ -134,11 +134,11 @@ public class TestStandardAnalyzer extends LuceneTestCase { // domain names assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"}); //Notice the trailing . See https://issues.apache.org/jira/browse/LUCENE-1068. - //TODO: Remove in 3.x - assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "" }); - // the following should be recognized as HOST. The code that sets replaceDepAcronym should be removed in the next release. - ((StandardAnalyzer) a).setReplaceInvalidAcronym(true); - assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "" }); + // the following should be recognized as HOST: + assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "" }); + ((StandardAnalyzer) a).setReplaceInvalidAcronym(false); + assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "" }); + ((StandardAnalyzer) a).setReplaceInvalidAcronym(true); } public void testEMailAddresses() throws Exception { @@ -247,6 +247,6 @@ public class TestStandardAnalyzer extends LuceneTestCase { public void testDeprecatedAcronyms() throws Exception { // test backward compatibility for applications that require the old behavior. // this should be removed once replaceDepAcronym is removed. - assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "luceneapacheorg" }, new String[] { "" }); + assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "lucene.apache.org" }, new String[] { "" }); } }