mirror of https://github.com/apache/lucene.git
SOLR-1266: add WordDelimiterFilter.stemEnglishPossessive option
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@793090 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2d1dae0056
commit
8bb8d6561c
|
@ -250,6 +250,11 @@ New Features
|
|||
|
||||
64. SOLR-1256: Show the output of CharFilters in analysis.jsp. (koji)
|
||||
|
||||
65. SOLR-1266: Added stemEnglishPossessive option (default=true) to WordDelimiterFilter
|
||||
that allows disabling of english possessive stemming (removal of trailing 's from tokens)
|
||||
(Robert Muir via yonik)
|
||||
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
|
||||
|
|
|
@ -147,6 +147,13 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
*/
|
||||
final int splitOnNumerics;
|
||||
|
||||
/**
|
||||
* If 1, causes trailing "'s" to be removed for each subword. (Defaults to 1)
|
||||
* <p/>
|
||||
* "O'Neil's" => "O", "Neil"
|
||||
*/
|
||||
final int stemEnglishPossessive;
|
||||
|
||||
/**
|
||||
* If not null is the set of tokens to protect from being delimited
|
||||
*
|
||||
|
@ -165,9 +172,10 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
|
||||
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
||||
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
|
||||
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
|
||||
super(in);
|
||||
this.generateWordParts = generateWordParts;
|
||||
this.generateNumberParts = generateNumberParts;
|
||||
|
@ -178,14 +186,27 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
this.preserveOriginal = preserveOriginal;
|
||||
this.charTypeTable = charTypeTable;
|
||||
this.splitOnNumerics = splitOnNumerics;
|
||||
this.stemEnglishPossessive = stemEnglishPossessive;
|
||||
this.protWords = protWords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compatibility constructor
|
||||
*
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
|
||||
this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, 1, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compatibility constructor
|
||||
*
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
|
@ -203,16 +224,27 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
|
||||
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
||||
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
|
||||
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, protWords);
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
|
||||
}
|
||||
|
||||
/** * Compatibility constructor
|
||||
*
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
|
@ -223,7 +255,7 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
* Compatibility constructor
|
||||
*
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
|
@ -234,7 +266,7 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
* Compatibility constructor
|
||||
*
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
|
@ -388,7 +420,7 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
// check and remove "'s" from the end of a token.
|
||||
// the pattern to check for is
|
||||
// ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
|
||||
if ((lastType & ALPHA)!=0) {
|
||||
if (stemEnglishPossessive != 0 && ((lastType & ALPHA)!=0)) {
|
||||
if (ch=='\'' && pos+1< len
|
||||
&& (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
|
||||
{
|
||||
|
|
|
@ -71,6 +71,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
|
|||
int splitOnCaseChange=0;
|
||||
int splitOnNumerics=0;
|
||||
int preserveOriginal=0;
|
||||
int stemEnglishPossessive=0;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
|
@ -83,6 +84,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
|
|||
splitOnCaseChange = getInt("splitOnCaseChange", 1);
|
||||
splitOnNumerics = getInt("splitOnNumerics", 1);
|
||||
preserveOriginal = getInt("preserveOriginal", 0);
|
||||
stemEnglishPossessive = getInt("stemEnglishPossessive", 1);
|
||||
}
|
||||
|
||||
public WordDelimiterFilter create(TokenStream input) {
|
||||
|
@ -90,6 +92,6 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement
|
|||
generateWordParts, generateNumberParts,
|
||||
catenateWords, catenateNumbers, catenateAll,
|
||||
splitOnCaseChange, preserveOriginal,
|
||||
splitOnNumerics, protectedWords);
|
||||
splitOnNumerics, stemEnglishPossessive, protectedWords);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -359,5 +359,34 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
|||
|
||||
|
||||
}
|
||||
|
||||
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
|
||||
boolean done=false;
|
||||
@Override
|
||||
public Token next() throws IOException {
|
||||
if (done) return null;
|
||||
done = true;
|
||||
return new Token(input,0,input.length());
|
||||
}
|
||||
}
|
||||
,1,1,0,0,0,1,0,1,stemPossessive,null
|
||||
);
|
||||
|
||||
for(String expected : output) {
|
||||
Token t = wdf.next();
|
||||
assertEquals(expected, t.term());
|
||||
}
|
||||
|
||||
assertEquals(null, wdf.next());
|
||||
}
|
||||
|
||||
/*
|
||||
* Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
|
||||
*/
|
||||
public void testPossessives() throws Exception {
|
||||
doSplitPossessive(1, "ra's", "ra");
|
||||
doSplitPossessive(0, "ra's", "ra", "s");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue