mirror of https://github.com/apache/lucene.git
LUCENE-1151: don't mis-identify HOST as ACRONYM, but, provide static method/property to revert to backwards-compatible but buggy behavior
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@618001 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
afb58ca0d2
commit
0ec1870b11
13
CHANGES.txt
13
CHANGES.txt
|
@ -5,6 +5,19 @@ $Id$
|
|||
|
||||
Changes in runtime behavior
|
||||
|
||||
1. LUCENE-1151: Fix StandardAnalyzer to not mis-identify host names
|
||||
(eg lucene.apache.org) as an ACRONYM. To get back to the pre-2.4
|
||||
backwards compatible, but buggy, behavior, you can either call
|
||||
StandardAnalyzer.setDefaultReplaceInvalidAcronym(false) (static
|
||||
method), or, set system property
|
||||
org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
|
||||
to "false" on JVM startup. All StandardAnalyzer instances created
|
||||
after that will then show the pre-2.4 behavior. Alternatively,
|
||||
you can call setReplaceInvalidAcronym(false) to change the
|
||||
behavior per instance of StandardAnalyzer. This backwards
|
||||
compatibility will be removed in 3.0 (hardwiring the value to
|
||||
true). (Mike McCandless)
|
||||
|
||||
API Changes
|
||||
|
||||
1. LUCENE-1084: Changed all IndexWriter constructors to take an
|
||||
|
|
|
@ -41,8 +41,49 @@ public class StandardAnalyzer extends Analyzer {
|
|||
*
|
||||
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||
*/
|
||||
private boolean replaceInvalidAcronym = false;
|
||||
|
||||
private boolean replaceInvalidAcronym = defaultReplaceInvalidAcronym;
|
||||
|
||||
private static boolean defaultReplaceInvalidAcronym;
|
||||
|
||||
// Default to false (fixed the bug), unless the system prop is set
|
||||
static {
|
||||
final String v = System.getProperty("org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym");
|
||||
if (v == null || v.equals("true"))
|
||||
defaultReplaceInvalidAcronym = true;
|
||||
else
|
||||
defaultReplaceInvalidAcronym = false;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return true if new instances of StandardTokenizer will
|
||||
* replace mischaracterized acronyms
|
||||
*
|
||||
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||
* @deprecated This will be removed (hardwired to true) in 3.0
|
||||
*/
|
||||
public static boolean getDefaultReplaceInvalidAcronym() {
|
||||
return defaultReplaceInvalidAcronym;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param replaceInvalidAcronym Set to true to have new
|
||||
* instances of StandardTokenizer replace mischaracterized
|
||||
* acronyms by default. Set to false to preseve the
|
||||
* previous (before 2.4) buggy behavior. Alternatively,
|
||||
* set the system property
|
||||
* org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
|
||||
* to false.
|
||||
*
|
||||
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||
* @deprecated This will be removed (hardwired to true) in 3.0
|
||||
*/
|
||||
public static void setDefaultReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
|
||||
defaultReplaceInvalidAcronym = replaceInvalidAcronym;
|
||||
}
|
||||
|
||||
|
||||
/** An array containing some common English words that are usually not
|
||||
useful for searching. */
|
||||
public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
|
||||
|
@ -204,6 +245,7 @@ public class StandardAnalyzer extends Analyzer {
|
|||
* @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
|
||||
*
|
||||
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||
* @deprecated This will be removed (hardwired to true) in 3.0
|
||||
*/
|
||||
public boolean isReplaceInvalidAcronym() {
|
||||
return replaceInvalidAcronym;
|
||||
|
@ -214,6 +256,7 @@ public class StandardAnalyzer extends Analyzer {
|
|||
* @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
|
||||
*
|
||||
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||
* @deprecated This will be removed (hardwired to true) in 3.0
|
||||
*/
|
||||
public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
|
||||
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||
|
|
|
@ -134,11 +134,11 @@ public class TestStandardAnalyzer extends LuceneTestCase {
|
|||
// domain names
|
||||
assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"});
|
||||
//Notice the trailing . See https://issues.apache.org/jira/browse/LUCENE-1068.
|
||||
//TODO: Remove in 3.x
|
||||
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
|
||||
// the following should be recognized as HOST. The code that sets replaceDepAcronym should be removed in the next release.
|
||||
((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
|
||||
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||
// the following should be recognized as HOST:
|
||||
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||
((StandardAnalyzer) a).setReplaceInvalidAcronym(false);
|
||||
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
|
||||
((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
|
||||
}
|
||||
|
||||
public void testEMailAddresses() throws Exception {
|
||||
|
@ -247,6 +247,6 @@ public class TestStandardAnalyzer extends LuceneTestCase {
|
|||
public void testDeprecatedAcronyms() throws Exception {
|
||||
// test backward compatibility for applications that require the old behavior.
|
||||
// this should be removed once replaceDepAcronym is removed.
|
||||
assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "luceneapacheorg" }, new String[] { "<ACRONYM>" });
|
||||
assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "lucene.apache.org" }, new String[] { "<HOST>" });
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue