LUCENE-1151: don't mis-identify HOST as ACRONYM, but, provide static method/property to revert to backwards-compatible but buggy behavior

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@618001 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2008-02-03 15:33:16 +00:00
parent afb58ca0d2
commit 0ec1870b11
3 changed files with 64 additions and 8 deletions

View File

@ -5,6 +5,19 @@ $Id$
Changes in runtime behavior Changes in runtime behavior
1. LUCENE-1151: Fix StandardAnalyzer to not mis-identify host names
(e.g. lucene.apache.org) as an ACRONYM. To get back to the pre-2.4
backwards compatible, but buggy, behavior, you can either call
StandardAnalyzer.setDefaultReplaceInvalidAcronym(false) (static
method), or, set system property
org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
to "false" on JVM startup. All StandardAnalyzer instances created
after that will then show the pre-2.4 behavior. Alternatively,
you can call setReplaceInvalidAcronym(false) to change the
behavior per instance of StandardAnalyzer. This backwards
compatibility will be removed in 3.0 (hardwiring the value to
true). (Mike McCandless)
API Changes API Changes
1. LUCENE-1084: Changed all IndexWriter constructors to take an 1. LUCENE-1084: Changed all IndexWriter constructors to take an

View File

@ -41,7 +41,48 @@ public class StandardAnalyzer extends Analyzer {
* *
* See https://issues.apache.org/jira/browse/LUCENE-1068 * See https://issues.apache.org/jira/browse/LUCENE-1068
*/ */
private boolean replaceInvalidAcronym = false; private boolean replaceInvalidAcronym = defaultReplaceInvalidAcronym;
// Class-wide default that each new analyzer instance copies into its own
// replaceInvalidAcronym field when it is constructed.
private static boolean defaultReplaceInvalidAcronym;

// Default to true (the fixed behavior) unless the system property is
// present and holds anything other than "true".
static {
  final String propertyValue = System.getProperty("org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym");
  defaultReplaceInvalidAcronym = propertyValue == null || "true".equals(propertyValue);
}
/**
 * Returns the class-wide default that newly created StandardAnalyzer
 * instances use to initialize their replaceInvalidAcronym setting.
 *
 * See https://issues.apache.org/jira/browse/LUCENE-1068
 *
 * @return true if new instances of StandardAnalyzer will, by default,
 * replace mischaracterized acronyms in the StandardTokenizer
 * @deprecated This will be removed (hardwired to true) in 3.0
 */
public static boolean getDefaultReplaceInvalidAcronym() {
return defaultReplaceInvalidAcronym;
}
/**
 * Sets the class-wide default that StandardAnalyzer instances created
 * after this call use to initialize their replaceInvalidAcronym setting.
 * Each instance copies the default into its own field when it is
 * constructed.
 *
 * See https://issues.apache.org/jira/browse/LUCENE-1068
 *
 * @param replaceInvalidAcronym Set to true to have new
 * instances of StandardAnalyzer replace mischaracterized
 * acronyms by default. Set to false to preserve the
 * previous (before 2.4) buggy behavior. Alternatively,
 * set the system property
 * org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
 * to false; that property is read once, in this class's static
 * initializer.
 * @deprecated This will be removed (hardwired to true) in 3.0
 */
public static void setDefaultReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
defaultReplaceInvalidAcronym = replaceInvalidAcronym;
}
/** An array containing some common English words that are usually not /** An array containing some common English words that are usually not
useful for searching. */ useful for searching. */
@ -204,6 +245,7 @@ public class StandardAnalyzer extends Analyzer {
* @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer * @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
* *
* See https://issues.apache.org/jira/browse/LUCENE-1068 * See https://issues.apache.org/jira/browse/LUCENE-1068
* @deprecated This will be removed (hardwired to true) in 3.0
*/ */
public boolean isReplaceInvalidAcronym() { public boolean isReplaceInvalidAcronym() {
return replaceInvalidAcronym; return replaceInvalidAcronym;
@ -214,6 +256,7 @@ public class StandardAnalyzer extends Analyzer {
* @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer * @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
* *
* See https://issues.apache.org/jira/browse/LUCENE-1068 * See https://issues.apache.org/jira/browse/LUCENE-1068
* @deprecated This will be removed (hardwired to true) in 3.0
*/ */
public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) { public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym; this.replaceInvalidAcronym = replaceInvalidAcronym;

View File

@ -134,11 +134,11 @@ public class TestStandardAnalyzer extends LuceneTestCase {
// domain names // domain names
assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"}); assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"});
//Notice the trailing . See https://issues.apache.org/jira/browse/LUCENE-1068. //Notice the trailing . See https://issues.apache.org/jira/browse/LUCENE-1068.
//TODO: Remove in 3.x // the following should be recognized as HOST:
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
// the following should be recognized as HOST. The code that sets replaceDepAcronym should be removed in the next release.
((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" }); assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
((StandardAnalyzer) a).setReplaceInvalidAcronym(false);
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
} }
public void testEMailAddresses() throws Exception { public void testEMailAddresses() throws Exception {
@ -247,6 +247,6 @@ public class TestStandardAnalyzer extends LuceneTestCase {
public void testDeprecatedAcronyms() throws Exception { public void testDeprecatedAcronyms() throws Exception {
// test backward compatibility for applications that require the old behavior. // test backward compatibility for applications that require the old behavior.
// this should be removed once replaceDepAcronym is removed. // this should be removed once replaceDepAcronym is removed.
assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "luceneapacheorg" }, new String[] { "<ACRONYM>" }); assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "lucene.apache.org" }, new String[] { "<HOST>" });
} }
} }