mirror of https://github.com/apache/lucene.git
SOLR-257: add splitOnCaseChange parameter to WordDelimiterFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@545597 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8454db1be1
commit
39e0ff32f2
|
@ -37,6 +37,9 @@ New Features
|
|||
that keeps tokens with text in the registered keeplist. This behaves like
|
||||
the inverse of StopFilter. (ryan)
|
||||
|
||||
3. SOLR-257: WordDelimiterFilter has a new parameter splitOnCaseChange,
|
||||
which can be set to 0 to disable splitting "PowerShot" => "Power" "Shot"
|
||||
|
||||
Changes in runtime behavior
|
||||
|
||||
Optimizations
|
||||
|
|
|
@ -151,7 +151,7 @@
|
|||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
-->
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="1" catenateNumbers="0" catenateAll="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
|
@ -61,6 +61,7 @@ import java.util.List;
|
|||
* @author yonik
|
||||
* @version $Id$
|
||||
*/
|
||||
|
||||
final class WordDelimiterFilter extends TokenFilter {
|
||||
private final byte[] charTypeTable;
|
||||
|
||||
|
@ -127,6 +128,12 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
*/
|
||||
final int catenateAll;
|
||||
|
||||
/**
|
||||
* If 0, causes case changes to be ignored (subwords will only be generated
|
||||
* given SUBWORD_DELIM tokens). (Defaults to 1)
|
||||
*/
|
||||
final int splitOnCaseChange;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param in Token stream to be filtered.
|
||||
|
@ -136,27 +143,39 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
* @param catenateWords 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
|
||||
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
|
||||
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
||||
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
|
||||
super(in);
|
||||
this.generateWordParts = generateWordParts;
|
||||
this.generateNumberParts = generateNumberParts;
|
||||
this.catenateWords = catenateWords;
|
||||
this.catenateNumbers = catenateNumbers;
|
||||
this.catenateAll = catenateAll;
|
||||
this.splitOnCaseChange = splitOnCaseChange;
|
||||
this.charTypeTable = charTypeTable;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param in Token stream to be filtered.
|
||||
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
|
||||
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
|
||||
* @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
|
||||
* @param catenateWords 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
|
||||
* @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
|
||||
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
||||
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange) {
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange);
|
||||
}
|
||||
/** Compatibility constructor */
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
|
||||
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
|
||||
}
|
||||
/** Compatibility constructor */
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll);
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1);
|
||||
}
|
||||
|
||||
int charType(int ch) {
|
||||
|
@ -289,7 +308,11 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
// It will also handle pluralization of
|
||||
// an uppercase word such as FOOs (won't split).
|
||||
|
||||
if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
|
||||
if (splitOnCaseChange == 0 &&
|
||||
(lastType & ALPHA) != 0 && (type & ALPHA) != 0) {
|
||||
// ALPHA->ALPHA: always ignore if case isn't considered.
|
||||
|
||||
} else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
|
||||
// UPPER->LOWER: Don't split
|
||||
} else {
|
||||
// NOTE: this code currently assumes that only one flag
|
||||
|
|
|
@ -31,19 +31,22 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
|
|||
int catenateWords=0;
|
||||
int catenateNumbers=0;
|
||||
int catenateAll=0;
|
||||
int splitOnCaseChange=0;
|
||||
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
generateWordParts = getInt("generateWordParts",1);
|
||||
generateNumberParts = getInt("generateNumberParts",1);
|
||||
catenateWords = getInt("catenateWords",0);
|
||||
catenateNumbers = getInt("catenateNumbers",0);
|
||||
catenateAll = getInt("catenateAll",0);
|
||||
generateWordParts = getInt("generateWordParts", 1);
|
||||
generateNumberParts = getInt("generateNumberParts", 1);
|
||||
catenateWords = getInt("catenateWords", 0);
|
||||
catenateNumbers = getInt("catenateNumbers", 0);
|
||||
catenateAll = getInt("catenateAll", 0);
|
||||
splitOnCaseChange = getInt("splitOnCaseChange", 1);
|
||||
}
|
||||
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new WordDelimiterFilter(input,
|
||||
generateWordParts, generateNumberParts,
|
||||
catenateWords, catenateNumbers, catenateAll);
|
||||
generateWordParts, generateNumberParts,
|
||||
catenateWords, catenateNumbers, catenateAll,
|
||||
splitOnCaseChange);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -66,5 +66,20 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
|||
assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
|
||||
}
|
||||
|
||||
public void testIgnoreCaseChange() {
|
||||
|
||||
assertU(adoc("id", "43",
|
||||
"wdf_nocase", "HellO WilliAM",
|
||||
"subword", "GoodBye JonEs"));
|
||||
assertU(commit());
|
||||
|
||||
assertQ("no case change",
|
||||
req("wdf_nocase:(hell o am)")
|
||||
,"//result[@numFound=0]"
|
||||
);
|
||||
assertQ("case change",
|
||||
req("subword:(good jon)")
|
||||
,"//result[@numFound=1]"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -82,6 +82,15 @@
|
|||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<!-- Demonstrating ignoreCaseChange -->
|
||||
<fieldtype name="wdf_nocase" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
|
||||
<!-- HighlitText optimizes storage for (long) columns which will be highlit -->
|
||||
<fieldtype name="highlittext" class="solr.TextField" compressThreshold="345" />
|
||||
|
@ -359,6 +368,7 @@
|
|||
<field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
|
||||
<field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
|
||||
<field name="dedup" type="dedup" indexed="true" stored="true"/>
|
||||
<field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
|
||||
|
||||
<field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>
|
||||
|
||||
|
|
Loading…
Reference in New Issue