SOLR-1266: add WordDelimiterFilter.stemEnglishPossessive option

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@793090 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2009-07-10 19:40:33 +00:00
parent 2d1dae0056
commit 8bb8d6561c
4 changed files with 76 additions and 8 deletions

View File

@ -250,6 +250,11 @@ New Features
64. SOLR-1256: Show the output of CharFilters in analysis.jsp. (koji)
65. SOLR-1266: Added stemEnglishPossessive option (default=true) to WordDelimiterFilter
that allows disabling of english possessive stemming (removal of trailing 's from tokens)
(Robert Muir via yonik)
Optimizations
----------------------
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the

View File

@ -147,6 +147,13 @@ final class WordDelimiterFilter extends TokenFilter {
*/
final int splitOnNumerics;
/**
* If 1, causes trailing "'s" to be removed for each subword. (Defaults to 1)
* <p/>
* "O'Neil's" => "O", "Neil"
*/
final int stemEnglishPossessive;
/**
* If not null is the set of tokens to protect from being delimited
*
@ -165,9 +172,10 @@ final class WordDelimiterFilter extends TokenFilter {
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
super(in);
this.generateWordParts = generateWordParts;
this.generateNumberParts = generateNumberParts;
@ -178,14 +186,27 @@ final class WordDelimiterFilter extends TokenFilter {
this.preserveOriginal = preserveOriginal;
this.charTypeTable = charTypeTable;
this.splitOnNumerics = splitOnNumerics;
this.stemEnglishPossessive = stemEnglishPossessive;
this.protWords = protWords;
}
/**
* Compatibility constructor
*
* @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, 1, null);
}
/**
* Compatibility constructor
*
* @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)}
* {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
@ -203,16 +224,27 @@ final class WordDelimiterFilter extends TokenFilter {
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
}
/**
* @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, protWords);
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
}
/** * Compatibility constructor
*
* @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
@ -223,7 +255,7 @@ final class WordDelimiterFilter extends TokenFilter {
* Compatibility constructor
*
* @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
@ -234,7 +266,7 @@ final class WordDelimiterFilter extends TokenFilter {
* Compatibility constructor
*
* @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
@ -388,7 +420,7 @@ final class WordDelimiterFilter extends TokenFilter {
// check and remove "'s" from the end of a token.
// the pattern to check for is
// ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
if ((lastType & ALPHA)!=0) {
if (stemEnglishPossessive != 0 && ((lastType & ALPHA)!=0)) {
if (ch=='\'' && pos+1< len
&& (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
{

View File

@ -71,6 +71,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
int splitOnCaseChange=0;
int splitOnNumerics=0;
int preserveOriginal=0;
int stemEnglishPossessive=0;
@Override
public void init(Map<String, String> args) {
@ -83,6 +84,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
splitOnCaseChange = getInt("splitOnCaseChange", 1);
splitOnNumerics = getInt("splitOnNumerics", 1);
preserveOriginal = getInt("preserveOriginal", 0);
stemEnglishPossessive = getInt("stemEnglishPossessive", 1);
}
public WordDelimiterFilter create(TokenStream input) {
@ -90,6 +92,6 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
generateWordParts, generateNumberParts,
catenateWords, catenateNumbers, catenateAll,
splitOnCaseChange, preserveOriginal,
splitOnNumerics, protectedWords);
splitOnNumerics, stemEnglishPossessive, protectedWords);
}
}

View File

@ -359,5 +359,34 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
}
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
boolean done=false;
@Override
public Token next() throws IOException {
if (done) return null;
done = true;
return new Token(input,0,input.length());
}
}
,1,1,0,0,0,1,0,1,stemPossessive,null
);
for(String expected : output) {
Token t = wdf.next();
assertEquals(expected, t.term());
}
assertEquals(null, wdf.next());
}
/*
* Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
*/
public void testPossessives() throws Exception {
doSplitPossessive(1, "ra's", "ra");
doSplitPossessive(0, "ra's", "ra", "s");
}
}