mirror of https://github.com/apache/lucene.git
LUCENE-3410: Deprecated multi-int constructors in WordDelimiterFilter. Now uses int bitfield
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1165995 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9e1960350e
commit
4b44bd7d83
|
@ -34,6 +34,10 @@ API Changes
|
|||
|
||||
* LUCENE-3400: Removed DutchAnalyzer.setStemDictionary (Chris Male)
|
||||
|
||||
* LUCENE-3410: Deprecated the WordDelimiterFilter constructors accepting multiple
|
||||
ints masquerading as booleans. Preferred constructor now accepts a single int
|
||||
bitfield (Chris Male)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
|
||||
|
|
|
@ -72,46 +72,65 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
public static final int ALPHANUM = 0x07;
|
||||
|
||||
/**
|
||||
* If true, causes parts of words to be generated:
|
||||
* Causes parts of words to be generated:
|
||||
* <p/>
|
||||
* "PowerShot" => "Power" "Shot"
|
||||
*/
|
||||
final boolean generateWordParts;
|
||||
public static final int GENERATE_WORD_PARTS = 1;
|
||||
|
||||
/**
|
||||
* If true, causes number subwords to be generated:
|
||||
* Causes number subwords to be generated:
|
||||
* <p/>
|
||||
* "500-42" => "500" "42"
|
||||
*/
|
||||
final boolean generateNumberParts;
|
||||
public static final int GENERATE_NUMBER_PARTS = 2;
|
||||
|
||||
/**
|
||||
* If true, causes maximum runs of word parts to be catenated:
|
||||
* Causes maximum runs of word parts to be catenated:
|
||||
* <p/>
|
||||
* "wi-fi" => "wifi"
|
||||
*/
|
||||
final boolean catenateWords;
|
||||
public static final int CATENATE_WORDS = 4;
|
||||
|
||||
/**
|
||||
* If true, causes maximum runs of number parts to be catenated:
|
||||
* Causes maximum runs of number parts to be catenated:
|
||||
* <p/>
|
||||
* "500-42" => "50042"
|
||||
* "wi-fi" => "wifi"
|
||||
*/
|
||||
final boolean catenateNumbers;
|
||||
public static final int CATENATE_NUMBERS = 8;
|
||||
|
||||
/**
|
||||
* If true, causes all subword parts to be catenated:
|
||||
* Causes all subword parts to be catenated:
|
||||
* <p/>
|
||||
* "wi-fi-4000" => "wifi4000"
|
||||
*/
|
||||
final boolean catenateAll;
|
||||
public static final int CATENATE_ALL = 16;
|
||||
|
||||
/**
|
||||
* If true, original words are preserved and added to the subword list (Defaults to false)
|
||||
* Causes original words to be preserved and added to the subword list (Defaults to false)
|
||||
* <p/>
|
||||
* "500-42" => "500" "42" "500-42"
|
||||
*/
|
||||
final boolean preserveOriginal;
|
||||
public static final int PRESERVE_ORIGINAL = 32;
|
||||
|
||||
/**
|
||||
* If not set, causes case changes to be ignored (subwords will only be generated
|
||||
* given SUBWORD_DELIM tokens)
|
||||
*/
|
||||
public static final int SPLIT_ON_CASE_CHANGE = 64;
|
||||
|
||||
/**
|
||||
* If not set, causes numeric changes to be ignored (subwords will only be generated
|
||||
* given SUBWORD_DELIM tokens).
|
||||
*/
|
||||
public static final int SPLIT_ON_NUMERICS = 128;
|
||||
|
||||
/**
|
||||
* Causes trailing "'s" to be removed for each subword
|
||||
* <p/>
|
||||
* "O'Neil's" => "O", "Neil"
|
||||
*/
|
||||
public static final int STEM_ENGLISH_POSSESSIVE = 256;
|
||||
|
||||
/**
|
||||
* If not null, the set of tokens to protect from being delimited
|
||||
|
@ -119,6 +138,8 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
*/
|
||||
final CharArraySet protWords;
|
||||
|
||||
private final int flags;
|
||||
|
||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
|
||||
|
@ -153,9 +174,37 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
// this token must have posInc=0!
|
||||
private boolean hasOutputFollowingOriginal = false;
|
||||
|
||||
/**
|
||||
* Creates a new WordDelimiterFilter
|
||||
*
|
||||
* @param in TokenStream to be filtered
|
||||
* @param charTypeTable table containing character types
|
||||
* @param configurationFlags Flags configuring the filter
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
|
||||
super(in);
|
||||
this.flags = configurationFlags;
|
||||
this.protWords = protWords;
|
||||
this.iterator = new WordDelimiterIterator(
|
||||
charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new WordDelimiterFilter using {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}
|
||||
* as its charTypeTable
|
||||
*
|
||||
* @param in TokenStream to be filtered
|
||||
* @param configurationFlags Flags configuring the filter
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
|
||||
this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param in Token stream to be filtered.
|
||||
* @param charTypeTable
|
||||
* @param charTypeTable table containing character types
|
||||
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
|
||||
* @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
|
||||
* @param catenateWords 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
|
||||
|
@ -166,7 +215,9 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
|
||||
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
* @deprecated Use {@link #WordDelimiterFilter(TokenStream, byte[], int, CharArraySet)}
|
||||
*/
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in,
|
||||
byte[] charTypeTable,
|
||||
int generateWordParts,
|
||||
|
@ -180,14 +231,38 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
int stemEnglishPossessive,
|
||||
CharArraySet protWords) {
|
||||
super(in);
|
||||
this.generateWordParts = generateWordParts != 0;
|
||||
this.generateNumberParts = generateNumberParts != 0;
|
||||
this.catenateWords = catenateWords != 0;
|
||||
this.catenateNumbers = catenateNumbers != 0;
|
||||
this.catenateAll = catenateAll != 0;
|
||||
this.preserveOriginal = preserveOriginal != 0;
|
||||
|
||||
int flags = 0;
|
||||
if (generateWordParts != 0) {
|
||||
flags |= GENERATE_WORD_PARTS;
|
||||
}
|
||||
if (generateNumberParts != 0) {
|
||||
flags |= GENERATE_NUMBER_PARTS;
|
||||
}
|
||||
if (catenateWords != 0) {
|
||||
flags |= CATENATE_WORDS;
|
||||
}
|
||||
if (catenateNumbers != 0) {
|
||||
flags |= CATENATE_NUMBERS;
|
||||
}
|
||||
if (catenateAll != 0) {
|
||||
flags |= CATENATE_ALL;
|
||||
}
|
||||
if (preserveOriginal != 0) {
|
||||
flags |= PRESERVE_ORIGINAL;
|
||||
}
|
||||
if (splitOnCaseChange != 0) {
|
||||
flags |= SPLIT_ON_CASE_CHANGE;
|
||||
}
|
||||
if (splitOnNumerics != 0) {
|
||||
flags |= SPLIT_ON_NUMERICS;
|
||||
}
|
||||
if (stemEnglishPossessive != 0) {
|
||||
flags |= STEM_ENGLISH_POSSESSIVE;
|
||||
}
|
||||
this.protWords = protWords;
|
||||
this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0);
|
||||
this.flags = flags;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -202,7 +277,9 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
|
||||
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
* @deprecated Use {@link #WordDelimiterFilter(TokenStream, int, CharArraySet)}
|
||||
*/
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in,
|
||||
int generateWordParts,
|
||||
int generateNumberParts,
|
||||
|
@ -242,7 +319,7 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
// word consisting only of delimiters
|
||||
if (iterator.end == WordDelimiterIterator.DONE && !preserveOriginal) {
|
||||
if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL)) {
|
||||
// if the posInc is 1, simply ignore it in the accumulation
|
||||
if (posIncAttribute.getPositionIncrement() == 1) {
|
||||
accumPosInc--;
|
||||
|
@ -253,10 +330,10 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
saveState();
|
||||
|
||||
hasOutputToken = false;
|
||||
hasOutputFollowingOriginal = !preserveOriginal;
|
||||
hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL);
|
||||
lastConcatCount = 0;
|
||||
|
||||
if (preserveOriginal) {
|
||||
if (has(PRESERVE_ORIGINAL)) {
|
||||
posIncAttribute.setPositionIncrement(accumPosInc);
|
||||
accumPosInc = 0;
|
||||
return true;
|
||||
|
@ -312,7 +389,7 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
// add all subwords (catenateAll)
|
||||
if (catenateAll) {
|
||||
if (has(CATENATE_ALL)) {
|
||||
concatenate(concatAll);
|
||||
}
|
||||
|
||||
|
@ -385,7 +462,7 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
* @return {@code true} if concatenation should occur, {@code false} otherwise
|
||||
*/
|
||||
private boolean shouldConcatenate(int wordType) {
|
||||
return (catenateWords && isAlpha(wordType)) || (catenateNumbers && isDigit(wordType));
|
||||
return (has(CATENATE_WORDS) && isAlpha(wordType)) || (has(CATENATE_NUMBERS) && isDigit(wordType));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -395,7 +472,7 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
* @return {@code true} if a word/number part should be generated, {@code false} otherwise
|
||||
*/
|
||||
private boolean shouldGenerateParts(int wordType) {
|
||||
return (generateWordParts && isAlpha(wordType)) || (generateNumberParts && isDigit(wordType));
|
||||
return (has(GENERATE_WORD_PARTS) && isAlpha(wordType)) || (has(GENERATE_NUMBER_PARTS) && isDigit(wordType));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -494,6 +571,16 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
return (type & UPPER) != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the given flag is set
|
||||
*
|
||||
* @param flag Flag to see if set
|
||||
* @return {@code true} if flag is set
|
||||
*/
|
||||
private boolean has(int flag) {
|
||||
return (flags & flag) != 0;
|
||||
}
|
||||
|
||||
// ================================================= Inner Classes =================================================
|
||||
|
||||
/**
|
||||
|
|
|
@ -36,6 +36,9 @@ import java.io.StringReader;
|
|||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
|
||||
|
||||
/**
|
||||
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
|
||||
* TODO: should explicitly test things like protWords and not rely on
|
||||
|
@ -63,17 +66,17 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
@Test
|
||||
public void testOffsets() throws IOException {
|
||||
|
||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
// test that subwords and catenated subwords have
|
||||
// the correct offsets.
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 1, 1, 0, 1, 1, null);
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "foo", "bar", "foobar" },
|
||||
new int[] { 5, 9, 5 },
|
||||
new int[] { 8, 12, 12 });
|
||||
|
||||
wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 1, 1, 0, 1, 1, null);
|
||||
wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "foo", "bar", "foobar" },
|
||||
|
@ -82,9 +85,9 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testOffsetChange() throws Exception
|
||||
{
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 1, 1, 0, 1, 1, null);
|
||||
public void testOffsetChange() throws Exception {
|
||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "übelkeit" },
|
||||
|
@ -93,9 +96,9 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testOffsetChange2() throws Exception
|
||||
{
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 1, 1, 0, 1, 1, null);
|
||||
public void testOffsetChange2() throws Exception {
|
||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "übelkeit" },
|
||||
|
@ -104,9 +107,9 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testOffsetChange3() throws Exception
|
||||
{
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 1, 1, 0, 1, 1, null);
|
||||
public void testOffsetChange3() throws Exception {
|
||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "übelkeit" },
|
||||
|
@ -115,9 +118,9 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testOffsetChange4() throws Exception
|
||||
{
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 1, 1, 0, 1, 1, null);
|
||||
public void testOffsetChange4() throws Exception {
|
||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "foo", "bar", "foobar"},
|
||||
|
@ -126,8 +129,9 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void doSplit(final String input, String... output) throws Exception {
|
||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
|
||||
new StringReader(input), MockTokenizer.KEYWORD, false), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, 1, 1, 0, 0, 0, 1, 0, 1, 1, null);
|
||||
new StringReader(input), MockTokenizer.KEYWORD, false), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf, output);
|
||||
}
|
||||
|
@ -168,8 +172,10 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
|
||||
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
|
||||
flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new MockTokenizer(
|
||||
new StringReader(input), MockTokenizer.KEYWORD, false), 1,1,0,0,0,1,0,1,stemPossessive, null);
|
||||
new StringReader(input), MockTokenizer.KEYWORD, false), flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf, output);
|
||||
}
|
||||
|
@ -208,6 +214,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
@Test
|
||||
public void testPositionIncrements() throws Exception {
|
||||
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("NUTCH")), false);
|
||||
|
||||
/* analyzer that uses whitespace + wdf */
|
||||
|
@ -216,7 +223,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
public TokenStream tokenStream(String field, Reader reader) {
|
||||
return new WordDelimiterFilter(
|
||||
new MockTokenizer(reader, MockTokenizer.WHITESPACE, false),
|
||||
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
|
||||
flags, protWords);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -244,7 +251,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
return new WordDelimiterFilter(
|
||||
new LargePosIncTokenFilter(
|
||||
new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)),
|
||||
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
|
||||
flags, protWords);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -277,8 +284,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
|
||||
new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), StandardAnalyzer.STOP_WORDS_SET);
|
||||
filter.setEnablePositionIncrements(true);
|
||||
return new WordDelimiterFilter(filter,
|
||||
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
|
||||
return new WordDelimiterFilter(filter, flags, protWords);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -35,6 +35,8 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
|
||||
|
||||
|
||||
/**
|
||||
* Factory for {@link WordDelimiterFilter}.
|
||||
|
@ -80,38 +82,44 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
|
|||
}
|
||||
|
||||
private CharArraySet protectedWords = null;
|
||||
|
||||
int generateWordParts=0;
|
||||
int generateNumberParts=0;
|
||||
int catenateWords=0;
|
||||
int catenateNumbers=0;
|
||||
int catenateAll=0;
|
||||
int splitOnCaseChange=0;
|
||||
int splitOnNumerics=0;
|
||||
int preserveOriginal=0;
|
||||
int stemEnglishPossessive=0;
|
||||
private int flags;
|
||||
byte[] typeTable = null;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
generateWordParts = getInt("generateWordParts", 1);
|
||||
generateNumberParts = getInt("generateNumberParts", 1);
|
||||
catenateWords = getInt("catenateWords", 0);
|
||||
catenateNumbers = getInt("catenateNumbers", 0);
|
||||
catenateAll = getInt("catenateAll", 0);
|
||||
splitOnCaseChange = getInt("splitOnCaseChange", 1);
|
||||
splitOnNumerics = getInt("splitOnNumerics", 1);
|
||||
preserveOriginal = getInt("preserveOriginal", 0);
|
||||
stemEnglishPossessive = getInt("stemEnglishPossessive", 1);
|
||||
if (getInt("generateWordParts", 1) != 0) {
|
||||
flags |= GENERATE_WORD_PARTS;
|
||||
}
|
||||
if (getInt("generateNumberParts", 1) != 0) {
|
||||
flags |= GENERATE_NUMBER_PARTS;
|
||||
}
|
||||
if (getInt("catenateWords", 0) != 0) {
|
||||
flags |= CATENATE_WORDS;
|
||||
}
|
||||
if (getInt("catenateNumbers", 0) != 0) {
|
||||
flags |= CATENATE_NUMBERS;
|
||||
}
|
||||
if (getInt("catenateAll", 0) != 0) {
|
||||
flags |= CATENATE_ALL;
|
||||
}
|
||||
if (getInt("splitOnCaseChange", 1) != 0) {
|
||||
flags |= SPLIT_ON_CASE_CHANGE;
|
||||
}
|
||||
if (getInt("splitOnNumerics", 1) != 0) {
|
||||
flags |= SPLIT_ON_NUMERICS;
|
||||
}
|
||||
if (getInt("preserveOriginal", 0) != 0) {
|
||||
flags |= PRESERVE_ORIGINAL;
|
||||
}
|
||||
if (getInt("stemEnglishPossessive", 1) != 0) {
|
||||
flags |= STEM_ENGLISH_POSSESSIVE;
|
||||
}
|
||||
}
|
||||
|
||||
public WordDelimiterFilter create(TokenStream input) {
|
||||
return new WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
|
||||
generateWordParts, generateNumberParts,
|
||||
catenateWords, catenateNumbers, catenateAll,
|
||||
splitOnCaseChange, preserveOriginal,
|
||||
splitOnNumerics, stemEnglishPossessive, protectedWords);
|
||||
flags, protectedWords);
|
||||
}
|
||||
|
||||
// source => type
|
||||
|
@ -144,17 +152,17 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
|
|||
|
||||
private Byte parseType(String s) {
|
||||
if (s.equals("LOWER"))
|
||||
return WordDelimiterFilter.LOWER;
|
||||
return LOWER;
|
||||
else if (s.equals("UPPER"))
|
||||
return WordDelimiterFilter.UPPER;
|
||||
return UPPER;
|
||||
else if (s.equals("ALPHA"))
|
||||
return WordDelimiterFilter.ALPHA;
|
||||
return ALPHA;
|
||||
else if (s.equals("DIGIT"))
|
||||
return WordDelimiterFilter.DIGIT;
|
||||
return DIGIT;
|
||||
else if (s.equals("ALPHANUM"))
|
||||
return WordDelimiterFilter.ALPHANUM;
|
||||
return ALPHANUM;
|
||||
else if (s.equals("SUBWORD_DELIM"))
|
||||
return WordDelimiterFilter.SUBWORD_DELIM;
|
||||
return SUBWORD_DELIM;
|
||||
else
|
||||
return null;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue