mirror of https://github.com/apache/lucene.git
SOLR-1078: WDF - treat non-spacing marks,etc like letters
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@773085 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
acb387952d
commit
79414d2b72
|
@ -361,6 +361,10 @@ Bug Fixes
|
|||
|
||||
44. SOLR-1141: NullPointerException during snapshoot command in java based replication (Jian Han Guo, shalin)
|
||||
|
||||
45. SOLR-1078: Fixes to WordDelimiterFilter to avoid splitting or dropping
|
||||
international non-letter characters such as non spacing marks. (yonik)
|
||||
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
1. Upgraded to Lucene 2.4.0 (yonik)
|
||||
|
|
|
@ -242,15 +242,51 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
|
||||
}
|
||||
|
||||
|
||||
int charType(int ch) {
|
||||
if (ch<charTypeTable.length) {
|
||||
return charTypeTable[ch];
|
||||
} else if (Character.isLowerCase(ch)) {
|
||||
return LOWER;
|
||||
} else if (Character.isLetter(ch)) {
|
||||
return UPPER;
|
||||
} else {
|
||||
return SUBWORD_DELIM;
|
||||
}
|
||||
switch (Character.getType(ch)) {
|
||||
case Character.UPPERCASE_LETTER: return UPPER;
|
||||
case Character.LOWERCASE_LETTER: return LOWER;
|
||||
|
||||
case Character.TITLECASE_LETTER:
|
||||
case Character.MODIFIER_LETTER:
|
||||
case Character.OTHER_LETTER:
|
||||
case Character.NON_SPACING_MARK:
|
||||
case Character.ENCLOSING_MARK: // depends what it encloses?
|
||||
case Character.COMBINING_SPACING_MARK:
|
||||
return ALPHA;
|
||||
|
||||
case Character.DECIMAL_DIGIT_NUMBER:
|
||||
case Character.LETTER_NUMBER:
|
||||
case Character.OTHER_NUMBER:
|
||||
return DIGIT;
|
||||
|
||||
// case Character.SPACE_SEPARATOR:
|
||||
// case Character.LINE_SEPARATOR:
|
||||
// case Character.PARAGRAPH_SEPARATOR:
|
||||
// case Character.CONTROL:
|
||||
// case Character.FORMAT:
|
||||
// case Character.PRIVATE_USE:
|
||||
|
||||
case Character.SURROGATE: // prevent splitting
|
||||
return ALPHA|DIGIT;
|
||||
|
||||
// case Character.DASH_PUNCTUATION:
|
||||
// case Character.START_PUNCTUATION:
|
||||
// case Character.END_PUNCTUATION:
|
||||
// case Character.CONNECTOR_PUNCTUATION:
|
||||
// case Character.OTHER_PUNCTUATION:
|
||||
// case Character.MATH_SYMBOL:
|
||||
// case Character.CURRENCY_SYMBOL:
|
||||
// case Character.MODIFIER_SYMBOL:
|
||||
// case Character.OTHER_SYMBOL:
|
||||
// case Character.INITIAL_QUOTE_PUNCTUATION:
|
||||
// case Character.FINAL_QUOTE_PUNCTUATION:
|
||||
|
||||
default: return SUBWORD_DELIM;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -348,7 +384,7 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
|
||||
while (pos< len) {
|
||||
|
||||
if (type!=lastType) {
|
||||
if ((type & lastType)==0) { // no overlap in character type
|
||||
// check and remove "'s" from the end of a token.
|
||||
// the pattern to check for is
|
||||
// ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
|
||||
|
@ -389,9 +425,8 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
if (splitOnCaseChange == 0 &&
|
||||
(lastType & ALPHA) != 0 && (type & ALPHA) != 0) {
|
||||
// ALPHA->ALPHA: always ignore if case isn't considered.
|
||||
|
||||
} else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
|
||||
// UPPER->LOWER: Don't split
|
||||
} else if ((lastType & UPPER)!=0 && (type & ALPHA)!=0) {
|
||||
// UPPER->letter: Don't split
|
||||
} else if(splitOnNumerics == 0 &&
|
||||
( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) {
|
||||
// ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
|
||||
|
|
|
@ -327,4 +327,37 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
|||
);
|
||||
}
|
||||
|
||||
|
||||
public void doSplit(final String input, String... output) throws Exception {
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
|
||||
boolean done=false;
|
||||
@Override
|
||||
public Token next() throws IOException {
|
||||
if (done) return null;
|
||||
done = true;
|
||||
return new Token(input,0,input.length());
|
||||
}
|
||||
}
|
||||
,1,1,0,0,0
|
||||
);
|
||||
|
||||
for(String expected : output) {
|
||||
Token t = wdf.next();
|
||||
assertEquals(expected, t.term());
|
||||
}
|
||||
|
||||
assertEquals(null, wdf.next());
|
||||
}
|
||||
|
||||
public void testSplits() throws Exception {
|
||||
doSplit("basic-split","basic","split");
|
||||
doSplit("camelCase","camel","Case");
|
||||
|
||||
// non-space marking symbol shouldn't cause split
|
||||
// this is an example in Thai
|
||||
doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue