SOLR-876: WordDelimiterFilter splitOnNumerics and protwords

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@727677 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris M. Hostetter 2008-12-18 08:50:27 +00:00
parent 663f27bd80
commit 39c1b9b3a5
6 changed files with 188 additions and 7 deletions

View File

@ -118,6 +118,9 @@ New Features
optimized distributed faceting refinement by lowering parsing overhead and
by making requests and responses smaller.
15. SOLR-876: WordDelimiterFilter now supports a splitOnNumerics
option, as well as a list of protected terms.
(Dan Rosher via hossman)
Optimizations
----------------------

View File

@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.CharArraySet;
import java.io.IOException;
import java.util.ArrayList;
@ -140,6 +141,18 @@ final class WordDelimiterFilter extends TokenFilter {
*/
final int preserveOriginal;
/**
 * If 0, transitions between alphabetic and numeric characters (e.g. "j2se")
 * do not by themselves create subword boundaries; splitting then happens only
 * on delimiter characters and, if enabled, on case changes. (Defaults to 1.)
 */
final int splitOnNumerics;
/**
 * If non-null, the set of tokens that are emitted unchanged and never
 * split into subwords.
 */
final CharArraySet protWords;
/**
*
* @param in Token stream to be filtered.
@ -151,8 +164,10 @@ final class WordDelimiterFilter extends TokenFilter {
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
  super(in);
  this.generateWordParts = generateWordParts;
  this.generateNumberParts = generateNumberParts;
@ -162,7 +177,22 @@ final class WordDelimiterFilter extends TokenFilter {
this.splitOnCaseChange = splitOnCaseChange;
this.preserveOriginal = preserveOriginal;
this.charTypeTable = charTypeTable;
this.splitOnNumerics = splitOnNumerics;
this.protWords = protWords;
}
/**
 * Backwards-compatible constructor: behaves like the full constructor with
 * splitOnNumerics=1 and no protected-word set.
 *
 * @deprecated Use
 * {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)}
 * instead.
 */
@Deprecated
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
  this(in, charTypeTable,
       generateWordParts, generateNumberParts,
       catenateWords, catenateNumbers, catenateAll,
       splitOnCaseChange, preserveOriginal,
       1, null);
}
/**
* @param in Token stream to be filtered.
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
@ -172,7 +202,20 @@ final class WordDelimiterFilter extends TokenFilter {
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
  // Same as the charTypeTable variant, using the built-in default type table.
  this(in, defaultWordDelimTable,
       generateWordParts, generateNumberParts,
       catenateWords, catenateNumbers, catenateAll,
       splitOnCaseChange, preserveOriginal,
       splitOnNumerics, protWords);
}
/**
 * Compatibility constructor: delegates with the historical defaults of
 * splitOnNumerics=1 and no protected-word set.
 *
 * @deprecated Use
 * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
 * instead.
 */
@Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
}
@ -180,23 +223,23 @@ final class WordDelimiterFilter extends TokenFilter {
* Compatibility constructor
*
* @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)}
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
}
/**
 * Compatibility constructor: delegates with splitOnCaseChange=1,
 * preserveOriginal=0, splitOnNumerics=1 and no protected-word set.
 *
 * @deprecated Use
 * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
 * instead.
 */
@Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
  this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
}
int charType(int ch) {
@ -273,6 +316,11 @@ final class WordDelimiterFilter extends TokenFilter {
int start=0;
if (len ==0) continue;
//skip protected tokens
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
return t;
}
origPosIncrement += t.getPositionIncrement();
// Avoid calling charType more than once for each char (basically
@ -344,6 +392,9 @@ final class WordDelimiterFilter extends TokenFilter {
} else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
// UPPER->LOWER: Don't split
} else if(splitOnNumerics == 0 &&
( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) {
// ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
} else {
// NOTE: this code currently assumes that only one flag
// is set for each character now, so we don't have

View File

@ -17,19 +17,59 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import java.util.Map;
import java.io.File;
import java.util.List;
import java.io.IOException;
/**
* @version $Id$
*/
public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
public void inform(ResourceLoader loader) {
  // The "protected" arg may name one file, or a comma-separated list of files.
  String wordFiles = args.get(PROTECTED_TOKENS);
  if (wordFiles == null) {
    return;
  }
  try {
    if (new File(wordFiles).exists()) {
      // The whole value resolves to a single existing file: load it directly.
      // CharArraySet takes the raw line list; no StopFilter pass is needed.
      List<String> wlist = loader.getLines(wordFiles);
      protectedWords = new CharArraySet(wlist, false);
    } else {
      // Otherwise split on commas and merge every file into one set.
      for (String file : StrUtils.splitFileNames(wordFiles)) {
        List<String> wlist = loader.getLines(file.trim());
        if (protectedWords == null) {
          protectedWords = new CharArraySet(wlist, false);
        } else {
          protectedWords.addAll(wlist);
        }
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
// Tokens that must pass through the filter unsplit; loaded in inform() from
// the file(s) named by the "protected" arg. Stays null when the arg is absent.
private CharArraySet protectedWords = null;

// 0/1 option flags; overwritten in init() from the factory args
// (the getInt(...) calls there supply the effective defaults).
int generateWordParts=0;
int generateNumberParts=0;
int catenateWords=0;
int catenateNumbers=0;
int catenateAll=0;
int splitOnCaseChange=0;
int splitOnNumerics=0;
int preserveOriginal=0;
@Override
@ -41,6 +81,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
catenateNumbers = getInt("catenateNumbers", 0);
catenateAll = getInt("catenateAll", 0);
splitOnCaseChange = getInt("splitOnCaseChange", 1);
splitOnNumerics = getInt("splitOnNumerics", 1);
preserveOriginal = getInt("preserveOriginal", 0);
}
@ -48,6 +89,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
// Build the filter with all configured options plus the protected-word set
// (null when no "protected" file was configured).
return new WordDelimiterFilter(input,
    generateWordParts, generateNumberParts,
    catenateWords, catenateNumbers, catenateAll,
    splitOnCaseChange, preserveOriginal,
    splitOnNumerics, protectedWords);
}
}

View File

@ -278,4 +278,53 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
assertEquals(12, t.startOffset());
assertEquals(15, t.endOffset());
}
public void testAlphaNumericWords(){
  // The "numericsubword" field is analyzed with splitOnNumerics="0", so the
  // alpha/numeric term "J2SE" should be indexed whole, never as "J2"/"SE".
  assertU(adoc("id", "68","numericsubword","Java/J2SE"));
  assertU(commit());

  assertQ("j2se found", req("numericsubword:(J2SE)"), "//result[@numFound=1]");
  assertQ("no j2 or se", req("numericsubword:(J2 OR SE)"), "//result[@numFound=0]");
}
public void testProtectedWords(){
  // "protectedsubword" uses protwords.txt, which lists c#, c++ and .net.
  assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
  assertU(commit());

  // Unprotected "Java" still indexes normally; protected terms survive intact.
  for (String term : new String[] {"java", ".net", "c#", "c++"}) {
    assertQ(term + " found",
        req("protectedsubword:(" + term + ")"),
        "//result[@numFound=1]");
  }

  // Protected terms must NOT have been broken into subword fragments.
  for (String fragment : new String[] {"c", "net"}) {
    assertQ(fragment + " found?",
        req("protectedsubword:" + fragment),
        "//result[@numFound=0]");
  }
}
}

View File

@ -18,3 +18,6 @@
#to test, we will use words that would normally obviously be stemmed.
cats
ridding
c#
c++
.net

View File

@ -252,6 +252,36 @@
</analyzer>
</fieldtype>
<fieldtype name="numericsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="protectedsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<!-- more flexible in matching skus, but more chance of a false match -->
<fieldtype name="skutype1" class="solr.TextField">
<analyzer type="index">
@ -387,6 +417,9 @@
<field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
<field name="subword" type="subword" indexed="true" stored="true"/>
<field name="numericsubword" type="numericsubword" indexed="true" stored="true"/>
<field name="protectedsubword" type="protectedsubword" indexed="true" stored="true"/>
<field name="sku1" type="skutype1" indexed="true" stored="true"/>
<field name="sku2" type="skutype2" indexed="true" stored="true"/>