From 79414d2b72631c82cf76235cc61e2efd812e7cc1 Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Fri, 8 May 2009 20:25:07 +0000 Subject: [PATCH] SOLR-1078: WDF - treat non-spacing marks,etc like letters git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@773085 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 4 ++ .../solr/analysis/WordDelimiterFilter.java | 55 +++++++++++++++---- .../analysis/TestWordDelimiterFilter.java | 33 +++++++++++ 3 files changed, 82 insertions(+), 10 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index d1385bcec50..78df7c32bda 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -361,6 +361,10 @@ Bug Fixes 44. SOLR-1141: NullPointerException during snapshoot command in java based replication (Jian Han Guo, shalin) +45. SOLR-1078: Fixes to WordDelimiterFilter to avoid splitting or dropping + international non-letter characters such as non spacing marks. (yonik) + + Other Changes ---------------------- 1. Upgraded to Lucene 2.4.0 (yonik) diff --git a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java index 1818b74c06e..ad4310e0be4 100644 --- a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java +++ b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java @@ -242,15 +242,51 @@ final class WordDelimiterFilter extends TokenFilter { this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null); } + int charType(int ch) { if (chALPHA: always ignore if case isn't considered. - - } else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) { - // UPPER->LOWER: Don't split + } else if ((lastType & UPPER)!=0 && (type & ALPHA)!=0) { + // UPPER->letter: Don't split } else if(splitOnNumerics == 0 && ( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) { // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split diff --git a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java index dda71ea173e..805c81ab75e 100644 --- a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java +++ b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java @@ -327,4 +327,37 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase { ); } + + public void doSplit(final String input, String... output) throws Exception { + WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() { + boolean done=false; + @Override + public Token next() throws IOException { + if (done) return null; + done = true; + return new Token(input,0,input.length()); + } + } + ,1,1,0,0,0 + ); + + for(String expected : output) { + Token t = wdf.next(); + assertEquals(expected, t.term()); + } + + assertEquals(null, wdf.next()); + } + + public void testSplits() throws Exception { + doSplit("basic-split","basic","split"); + doSplit("camelCase","camel","Case"); + + // non-space marking symbol shouldn't cause split + // this is an example in Thai + doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19"); + + + } + }