SOLR-257: add splitOnCaseChange parameter to WordDelimiterFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@545597 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mike Klaas 2007-06-08 19:02:27 +00:00
parent 8454db1be1
commit 39e0ff32f2
6 changed files with 68 additions and 14 deletions

View File

@ -37,6 +37,9 @@ New Features
that keeps tokens with text in the registered keeplist. This behaves like
the inverse of StopFilter. (ryan)
3. SOLR-257: WordDelimiterFilter has a new parameter splitOnCaseChange,
which can be set to 0 to disable splitting "PowerShot" => "Power" "Shot"
Changes in runtime behavior
Optimizations

View File

@ -151,7 +151,7 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="1" catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
@ -61,6 +61,7 @@ import java.util.List;
* @author yonik
* @version $Id$
*/
final class WordDelimiterFilter extends TokenFilter {
private final byte[] charTypeTable;
@ -127,6 +128,12 @@ final class WordDelimiterFilter extends TokenFilter {
*/
final int catenateAll;
/**
* If 0, causes case changes to be ignored (subwords will only be generated
* given SUBWORD_DELIM tokens). (Defaults to 1)
*/
final int splitOnCaseChange;
/**
*
* @param in Token stream to be filtered.
@ -136,27 +143,39 @@ final class WordDelimiterFilter extends TokenFilter {
* @param catenateWords 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
*/
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
super(in);
this.generateWordParts = generateWordParts;
this.generateNumberParts = generateNumberParts;
this.catenateWords = catenateWords;
this.catenateNumbers = catenateNumbers;
this.catenateAll = catenateAll;
this.splitOnCaseChange = splitOnCaseChange;
this.charTypeTable = charTypeTable;
}
/**
* @param in Token stream to be filtered.
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
* @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
* @param catenateWords 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
*/
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange);
}
/** Compatibility constructor */
@Deprecated
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
}
/** Compatibility constructor */
@Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll);
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
}
int charType(int ch) {
@ -289,7 +308,11 @@ final class WordDelimiterFilter extends TokenFilter {
// It will also handle pluralization of
// an uppercase word such as FOOs (won't split).
if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
if (splitOnCaseChange == 0 &&
(lastType & ALPHA) != 0 && (type & ALPHA) != 0) {
// ALPHA->ALPHA: always ignore if case isn't considered.
} else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
// UPPER->LOWER: Don't split
} else {
// NOTE: this code currently assumes that only one flag

View File

@ -31,19 +31,22 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
int catenateWords=0;
int catenateNumbers=0;
int catenateAll=0;
int splitOnCaseChange=0;
public void init(Map<String, String> args) {
super.init(args);
generateWordParts = getInt("generateWordParts",1);
generateNumberParts = getInt("generateNumberParts",1);
catenateWords = getInt("catenateWords",0);
catenateNumbers = getInt("catenateNumbers",0);
catenateAll = getInt("catenateAll",0);
generateWordParts = getInt("generateWordParts", 1);
generateNumberParts = getInt("generateNumberParts", 1);
catenateWords = getInt("catenateWords", 0);
catenateNumbers = getInt("catenateNumbers", 0);
catenateAll = getInt("catenateAll", 0);
splitOnCaseChange = getInt("splitOnCaseChange", 1);
}
public TokenStream create(TokenStream input) {
return new WordDelimiterFilter(input,
generateWordParts, generateNumberParts,
catenateWords, catenateNumbers, catenateAll);
generateWordParts, generateNumberParts,
catenateWords, catenateNumbers, catenateAll,
splitOnCaseChange);
}
}

View File

@ -66,5 +66,20 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
}
public void testIgnoreCaseChange() {
assertU(adoc("id", "43",
"wdf_nocase", "HellO WilliAM",
"subword", "GoodBye JonEs"));
assertU(commit());
assertQ("no case change",
req("wdf_nocase:(hell o am)")
,"//result[@numFound=0]"
);
assertQ("case change",
req("subword:(good jon)")
,"//result[@numFound=1]"
);
}
}

View File

@ -82,6 +82,15 @@
</analyzer>
</fieldtype>
<!-- Demonstrating ignoreCaseChange -->
<fieldtype name="wdf_nocase" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<!-- HighlitText optimizes storage for (long) columns which will be highlit -->
<fieldtype name="highlittext" class="solr.TextField" compressThreshold="345" />
@ -359,6 +368,7 @@
<field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
<field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
<field name="dedup" type="dedup" indexed="true" stored="true"/>
<field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
<field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>