LUCENE-1684: add matchVersion to StandardAnalyzer, and improve defaults if version is 2.9+

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@784984 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-06-15 21:19:03 +00:00
parent 6ed703e655
commit 757795bffe
2 changed files with 128 additions and 47 deletions

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.standard;
*/ */
import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -26,12 +27,24 @@ import java.util.Set;
/** /**
* Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
* LowerCaseFilter} and {@link StopFilter}, using a list of English stop words. * LowerCaseFilter} and {@link StopFilter}, using a list of
* English stop words.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating StandardAnalyzer:
* <ul>
* <li> As of 2.9, StopFilter preserves position
* increments by default
* <li> As of 2.9, Tokens incorrectly identified as acronyms
* are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
* *
* @version $Id$ * @version $Id$
*/ */
public class StandardAnalyzer extends Analyzer { public class StandardAnalyzer extends Analyzer {
private Set stopSet; private Set stopSet;
private Version matchVersion;
/** /**
* Specifies whether deprecated acronyms should be replaced with HOST type. * Specifies whether deprecated acronyms should be replaced with HOST type.
@ -94,87 +107,92 @@ public class StandardAnalyzer extends Analyzer {
/** Builds an analyzer with the default stop words ({@link /** Builds an analyzer with the default stop words ({@link
* #STOP_WORDS}). * #STOP_WORDS}).
* @deprecated Use {@link #StandardAnalyzer(boolean, String[])}, * @deprecated Use {@link #StandardAnalyzer(Version)},
* passing in null for the stop words, instead */ * instead. */
public StandardAnalyzer() { public StandardAnalyzer() {
this(STOP_WORDS); this(Version.LUCENE_24, STOP_WORDS);
}
/** Builds an analyzer with the default stop words ({@link
* #STOP_WORDS}).
* @param matchVersion Lucene version to match; see
* <a href="#version">above</a>
*/
public StandardAnalyzer(Version matchVersion) {
this(matchVersion, STOP_WORDS);
} }
/** Builds an analyzer with the given stop words. /** Builds an analyzer with the given stop words.
* @deprecated Use {@link #StandardAnalyzer(boolean, Set)} * @deprecated Use {@link #StandardAnalyzer(Version, Set)}
* instead */ * instead */
public StandardAnalyzer(Set stopWords) { public StandardAnalyzer(Set stopWords) {
stopSet = stopWords; this(Version.LUCENE_24, stopWords);
useDefaultStopPositionIncrements = true;
} }
/** Builds an analyzer with the given stop words. /** Builds an analyzer with the given stop words.
* @param enableStopPositionIncrements See {@link * @param matchVersion Lucene version to match See {@link
* StopFilter#setEnablePositionIncrements} * <a href="#version">above</a>}
* @param stopWords stop words */ * @param stopWords stop words */
public StandardAnalyzer(boolean enableStopPositionIncrements, Set stopWords) { public StandardAnalyzer(Version matchVersion, Set stopWords) {
stopSet = stopWords; stopSet = stopWords;
this.enableStopPositionIncrements = enableStopPositionIncrements; init(matchVersion);
} }
/** Builds an analyzer with the given stop words. /** Builds an analyzer with the given stop words.
* @deprecated Use {@link #StandardAnalyzer(boolean, * @deprecated Use {@link #StandardAnalyzer(Version,
* String[])} instead */ * String[])} instead */
public StandardAnalyzer(String[] stopWords) { public StandardAnalyzer(String[] stopWords) {
this(Version.LUCENE_24, stopWords);
}
/** Builds an analyzer with the given stop words.
* @param matchVersion Lucene version to match; see
* <a href="#version">above</a>
* @param stopWords Array of stop words */
public StandardAnalyzer(Version matchVersion, String[] stopWords) {
if (stopWords == null) { if (stopWords == null) {
stopWords = STOP_WORDS; stopWords = STOP_WORDS;
} }
stopSet = StopFilter.makeStopSet(stopWords); stopSet = StopFilter.makeStopSet(stopWords);
useDefaultStopPositionIncrements = true; init(matchVersion);
}
/** Builds an analyzer with the given stop words.
* @param enableStopPositionIncrements See {@link
* StopFilter#setEnablePositionIncrements}
* @param stopWords Array of stop words */
public StandardAnalyzer(boolean enableStopPositionIncrements, String[] stopWords) {
stopSet = StopFilter.makeStopSet(stopWords);
this.enableStopPositionIncrements = enableStopPositionIncrements;
} }
/** Builds an analyzer with the stop words from the given file. /** Builds an analyzer with the stop words from the given file.
* @see WordlistLoader#getWordSet(File) * @see WordlistLoader#getWordSet(File)
* @deprecated Use {@link #StandardAnalyzer(boolean, File)} * @deprecated Use {@link #StandardAnalyzer(Version, File)}
* instead * instead
*/ */
public StandardAnalyzer(File stopwords) throws IOException { public StandardAnalyzer(File stopwords) throws IOException {
stopSet = WordlistLoader.getWordSet(stopwords); this(Version.LUCENE_24, stopwords);
useDefaultStopPositionIncrements = true;
} }
/** Builds an analyzer with the stop words from the given file. /** Builds an analyzer with the stop words from the given file.
* @see WordlistLoader#getWordSet(File) * @see WordlistLoader#getWordSet(File)
* @param enableStopPositionIncrements See {@link * @param matchVersion Lucene version to match See {@link
* StopFilter#setEnablePositionIncrements} * <a href="#version">above</a>}
* @param stopwords File to read stop words from */ * @param stopwords File to read stop words from */
public StandardAnalyzer(boolean enableStopPositionIncrements, File stopwords) throws IOException { public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
stopSet = WordlistLoader.getWordSet(stopwords); stopSet = WordlistLoader.getWordSet(stopwords);
this.enableStopPositionIncrements = enableStopPositionIncrements; init(matchVersion);
} }
/** Builds an analyzer with the stop words from the given reader. /** Builds an analyzer with the stop words from the given reader.
* @see WordlistLoader#getWordSet(Reader) * @see WordlistLoader#getWordSet(Reader)
* @deprecated Use {@link #StandardAnalyzer(boolean, Reader)} * @deprecated Use {@link #StandardAnalyzer(Version, Reader)}
* instead * instead
*/ */
public StandardAnalyzer(Reader stopwords) throws IOException { public StandardAnalyzer(Reader stopwords) throws IOException {
stopSet = WordlistLoader.getWordSet(stopwords); this(Version.LUCENE_24, stopwords);
useDefaultStopPositionIncrements = true;
} }
/** Builds an analyzer with the stop words from the given reader. /** Builds an analyzer with the stop words from the given reader.
* @see WordlistLoader#getWordSet(Reader) * @see WordlistLoader#getWordSet(Reader)
* @param enableStopPositionIncrements See {@link * @param matchVersion Lucene version to match See {@link
* StopFilter#setEnablePositionIncrements} * <a href="#version">above</a>}
* @param stopwords Reader to read stop words from */ * @param stopwords Reader to read stop words from */
public StandardAnalyzer(boolean enableStopPositionIncrements, Reader stopwords) throws IOException { public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
stopSet = WordlistLoader.getWordSet(stopwords); stopSet = WordlistLoader.getWordSet(stopwords);
this.enableStopPositionIncrements = enableStopPositionIncrements; init(matchVersion);
} }
/** /**
@ -186,9 +204,8 @@ public class StandardAnalyzer extends Analyzer {
* @deprecated Remove in 3.X and make true the only valid value * @deprecated Remove in 3.X and make true the only valid value
*/ */
public StandardAnalyzer(boolean replaceInvalidAcronym) { public StandardAnalyzer(boolean replaceInvalidAcronym) {
this(STOP_WORDS); this(Version.LUCENE_24, STOP_WORDS);
this.replaceInvalidAcronym = replaceInvalidAcronym; this.replaceInvalidAcronym = replaceInvalidAcronym;
useDefaultStopPositionIncrements = true;
} }
/** /**
@ -200,9 +217,8 @@ public class StandardAnalyzer extends Analyzer {
* @deprecated Remove in 3.X and make true the only valid value * @deprecated Remove in 3.X and make true the only valid value
*/ */
public StandardAnalyzer(Reader stopwords, boolean replaceInvalidAcronym) throws IOException{ public StandardAnalyzer(Reader stopwords, boolean replaceInvalidAcronym) throws IOException{
this(stopwords); this(Version.LUCENE_24, stopwords);
this.replaceInvalidAcronym = replaceInvalidAcronym; this.replaceInvalidAcronym = replaceInvalidAcronym;
useDefaultStopPositionIncrements = true;
} }
/** /**
@ -214,9 +230,8 @@ public class StandardAnalyzer extends Analyzer {
* @deprecated Remove in 3.X and make true the only valid value * @deprecated Remove in 3.X and make true the only valid value
*/ */
public StandardAnalyzer(File stopwords, boolean replaceInvalidAcronym) throws IOException{ public StandardAnalyzer(File stopwords, boolean replaceInvalidAcronym) throws IOException{
this(stopwords); this(Version.LUCENE_24, stopwords);
this.replaceInvalidAcronym = replaceInvalidAcronym; this.replaceInvalidAcronym = replaceInvalidAcronym;
useDefaultStopPositionIncrements = true;
} }
/** /**
@ -229,9 +244,8 @@ public class StandardAnalyzer extends Analyzer {
* @deprecated Remove in 3.X and make true the only valid value * @deprecated Remove in 3.X and make true the only valid value
*/ */
public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{ public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{
this(stopwords); this(Version.LUCENE_24, stopwords);
this.replaceInvalidAcronym = replaceInvalidAcronym; this.replaceInvalidAcronym = replaceInvalidAcronym;
useDefaultStopPositionIncrements = true;
} }
/** /**
@ -243,10 +257,18 @@ public class StandardAnalyzer extends Analyzer {
* @deprecated Remove in 3.X and make true the only valid value * @deprecated Remove in 3.X and make true the only valid value
*/ */
public StandardAnalyzer(Set stopwords, boolean replaceInvalidAcronym) throws IOException{ public StandardAnalyzer(Set stopwords, boolean replaceInvalidAcronym) throws IOException{
this(stopwords); this(Version.LUCENE_24, stopwords);
this.replaceInvalidAcronym = replaceInvalidAcronym; this.replaceInvalidAcronym = replaceInvalidAcronym;
}
private final void init(Version matchVersion) {
this.matchVersion = matchVersion;
if (matchVersion.onOrAfter(Version.LUCENE_29)) {
enableStopPositionIncrements = true;
} else {
useDefaultStopPositionIncrements = true; useDefaultStopPositionIncrements = true;
} }
}
/** Constructs a {@link StandardTokenizer} filtered by a {@link /** Constructs a {@link StandardTokenizer} filtered by a {@link
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */ StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
@ -290,6 +312,7 @@ public class StandardAnalyzer extends Analyzer {
return maxTokenLength; return maxTokenLength;
} }
/** @deprecated Use {@link #tokenStream} instead */
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream(); SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) { if (streams == null) {

View File

@ -0,0 +1,58 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
/**
 * Used by certain classes to match version compatibility
 * across releases of Lucene.
 *
 * <p>Every constant carries an integer ordinal. {@link #LUCENE_CURRENT}
 * uses the reserved ordinal <code>0</code>, which {@link #onOrAfter(Version)}
 * treats as later than every concrete release.
 */
public final class Version extends Parameter implements Serializable {

  /** Ordinal reserved for {@link #LUCENE_CURRENT}; compared as "newest". */
  private static final int CURRENT_ORDINAL = 0;

  /** Use this to get the latest &amp; greatest settings, bug
   *  fixes, etc, for Lucene.
   *
   *  <p><b>WARNING</b>: if you use this setting, and then
   *  upgrade to a newer release of Lucene, sizable changes
   *  may happen.  If precise back compatibility is important
   *  then you should instead explicitly specify an actual
   *  version.
   */
  public static final Version LUCENE_CURRENT = new Version("LUCENE_CURRENT", CURRENT_ORDINAL);

  /** Match settings and bugs in Lucene's 2.4 release.
   * @deprecated This will be removed in 3.0 */
  public static final Version LUCENE_24 = new Version("LUCENE_24", 2400);

  /** Match settings and bugs in Lucene's 2.9 release.
   * @deprecated This will be removed in 3.0 */
  public static final Version LUCENE_29 = new Version("LUCENE_29", 2900);

  /** Ordinal used for ordering; <code>0</code> marks {@link #LUCENE_CURRENT}. */
  private final int v;

  /**
   * Creates a version constant.
   *
   * @param name name handed to the {@link Parameter} superclass constructor
   * @param v ordinal used by {@link #onOrAfter(Version)}; larger means
   *        newer, and <code>0</code> is reserved for {@link #LUCENE_CURRENT}
   */
  public Version(String name, int v) {
    super(name);
    this.v = v;
  }

  /**
   * Tells whether this version is the same as, or newer than,
   * <code>other</code>.
   *
   * <p>{@link #LUCENE_CURRENT} is on or after every version, and no
   * concrete release is on or after {@link #LUCENE_CURRENT}.
   *
   * @param other the version to compare against
   * @return <code>true</code> if this version is at least <code>other</code>
   */
  public boolean onOrAfter(Version other) {
    if (v == CURRENT_ORDINAL) {
      // "current" is at least as new as anything, including itself
      return true;
    }
    if (other.v == CURRENT_ORDINAL) {
      // a concrete release is never as new as "current"; without this
      // check the raw comparison (e.g. 2400 >= 0) would wrongly say yes
      return false;
    }
    return v >= other.v;
  }
}