mirror of https://github.com/apache/lucene.git
LUCENE-1151: don't mis-identify HOST as ACRONYM, but, provide static method/property to revert to backwards-compatible but buggy behavior
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@618001 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
afb58ca0d2
commit
0ec1870b11
13
CHANGES.txt
13
CHANGES.txt
|
@ -5,6 +5,19 @@ $Id$
|
||||||
|
|
||||||
Changes in runtime behavior
|
Changes in runtime behavior
|
||||||
|
|
||||||
|
1. LUCENE-1151: Fix StandardAnalyzer to not mis-identify host names
|
||||||
|
(eg lucene.apache.org) as an ACRONYM. To get back to the pre-2.4
|
||||||
|
backwards compatible, but buggy, behavior, you can either call
|
||||||
|
StandardAnalyzer.setDefaultReplaceInvalidAcronym(false) (static
|
||||||
|
method), or, set system property
|
||||||
|
org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
|
||||||
|
to "false" on JVM startup. All StandardAnalyzer instances created
|
||||||
|
after that will then show the pre-2.4 behavior. Alternatively,
|
||||||
|
you can call setReplaceInvalidAcronym(false) to change the
|
||||||
|
behavior per instance of StandardAnalyzer. This backwards
|
||||||
|
compatibility will be removed in 3.0 (hardwiring the value to
|
||||||
|
true). (Mike McCandless)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
1. LUCENE-1084: Changed all IndexWriter constructors to take an
|
1. LUCENE-1084: Changed all IndexWriter constructors to take an
|
||||||
|
|
|
@ -41,8 +41,49 @@ public class StandardAnalyzer extends Analyzer {
|
||||||
*
|
*
|
||||||
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
*/
|
*/
|
||||||
private boolean replaceInvalidAcronym = false;
|
private boolean replaceInvalidAcronym = defaultReplaceInvalidAcronym;
|
||||||
|
|
||||||
|
private static boolean defaultReplaceInvalidAcronym;
|
||||||
|
|
||||||
|
// Default to true (the fixed behavior), unless the system prop is set to a value other than "true"
|
||||||
|
static {
|
||||||
|
final String v = System.getProperty("org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym");
|
||||||
|
if (v == null || v.equals("true"))
|
||||||
|
defaultReplaceInvalidAcronym = true;
|
||||||
|
else
|
||||||
|
defaultReplaceInvalidAcronym = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return true if new instances of StandardTokenizer will
|
||||||
|
* replace mischaracterized acronyms
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
* @deprecated This will be removed (hardwired to true) in 3.0
|
||||||
|
*/
|
||||||
|
public static boolean getDefaultReplaceInvalidAcronym() {
|
||||||
|
return defaultReplaceInvalidAcronym;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param replaceInvalidAcronym Set to true to have new
|
||||||
|
* instances of StandardTokenizer replace mischaracterized
|
||||||
|
* acronyms by default. Set to false to preserve the
|
||||||
|
* previous (before 2.4) buggy behavior. Alternatively,
|
||||||
|
* set the system property
|
||||||
|
* org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
|
||||||
|
* to false.
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
* @deprecated This will be removed (hardwired to true) in 3.0
|
||||||
|
*/
|
||||||
|
public static void setDefaultReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
|
||||||
|
defaultReplaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** An array containing some common English words that are usually not
|
/** An array containing some common English words that are usually not
|
||||||
useful for searching. */
|
useful for searching. */
|
||||||
public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
|
public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
|
||||||
|
@ -204,6 +245,7 @@ public class StandardAnalyzer extends Analyzer {
|
||||||
* @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
|
* @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
|
||||||
*
|
*
|
||||||
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
* @deprecated This will be removed (hardwired to true) in 3.0
|
||||||
*/
|
*/
|
||||||
public boolean isReplaceInvalidAcronym() {
|
public boolean isReplaceInvalidAcronym() {
|
||||||
return replaceInvalidAcronym;
|
return replaceInvalidAcronym;
|
||||||
|
@ -214,6 +256,7 @@ public class StandardAnalyzer extends Analyzer {
|
||||||
* @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
|
* @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
|
||||||
*
|
*
|
||||||
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
* @deprecated This will be removed (hardwired to true) in 3.0
|
||||||
*/
|
*/
|
||||||
public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
|
public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
|
||||||
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
|
|
@ -134,11 +134,11 @@ public class TestStandardAnalyzer extends LuceneTestCase {
|
||||||
// domain names
|
// domain names
|
||||||
assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"});
|
assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"});
|
||||||
//Notice the trailing . See https://issues.apache.org/jira/browse/LUCENE-1068.
|
//Notice the trailing . See https://issues.apache.org/jira/browse/LUCENE-1068.
|
||||||
//TODO: Remove in 3.x
|
// the following should be recognized as HOST:
|
||||||
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
|
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||||
// the following should be recognized as HOST. The code that sets replaceDepAcronym should be removed in the next release.
|
((StandardAnalyzer) a).setReplaceInvalidAcronym(false);
|
||||||
((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
|
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
|
||||||
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testEMailAddresses() throws Exception {
|
public void testEMailAddresses() throws Exception {
|
||||||
|
@ -247,6 +247,6 @@ public class TestStandardAnalyzer extends LuceneTestCase {
|
||||||
public void testDeprecatedAcronyms() throws Exception {
|
public void testDeprecatedAcronyms() throws Exception {
|
||||||
// test backward compatibility for applications that require the old behavior.
|
// test backward compatibility for applications that require the old behavior.
|
||||||
// this should be removed once replaceDepAcronym is removed.
|
// this should be removed once replaceDepAcronym is removed.
|
||||||
assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "luceneapacheorg" }, new String[] { "<ACRONYM>" });
|
assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "lucene.apache.org" }, new String[] { "<HOST>" });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue