SOLR-876: WordDelimiterFilter splitOnNumerics and protwords

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@727677 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris M. Hostetter 2008-12-18 08:50:27 +00:00
parent 663f27bd80
commit 39c1b9b3a5
6 changed files with 188 additions and 7 deletions

View File

@ -118,6 +118,9 @@ New Features
optimized distributed faceting refinement by lowering parsing overhead and optimized distributed faceting refinement by lowering parsing overhead and
by making requests and responses smaller. by making requests and responses smaller.
15. SOLR-876: WordDelimiterFilter now supports a splitOnNumerics
option, as well as a list of protected terms.
(Dan Rosher via hossman)
Optimizations Optimizations
---------------------- ----------------------

View File

@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.CharArraySet;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@ -140,6 +141,18 @@ final class WordDelimiterFilter extends TokenFilter {
*/ */
final int preserveOriginal; final int preserveOriginal;
/**
* If 0, causes numeric changes to be ignored (subwords will only be generated
* given SUBWORD_DELIM tokens). (Defaults to 1)
*/
final int splitOnNumerics;
/**
* If not null, the set of tokens to protect from being delimited
*
*/
final CharArraySet protWords;
/** /**
* *
* @param in Token stream to be filtered. * @param in Token stream to be filtered.
@ -151,8 +164,10 @@ final class WordDelimiterFilter extends TokenFilter {
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) * @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42" * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
* @param protWords If not null, the set of tokens to protect from being delimited
*/ */
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) { public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
super(in); super(in);
this.generateWordParts = generateWordParts; this.generateWordParts = generateWordParts;
this.generateNumberParts = generateNumberParts; this.generateNumberParts = generateNumberParts;
@ -162,7 +177,22 @@ final class WordDelimiterFilter extends TokenFilter {
this.splitOnCaseChange = splitOnCaseChange; this.splitOnCaseChange = splitOnCaseChange;
this.preserveOriginal = preserveOriginal; this.preserveOriginal = preserveOriginal;
this.charTypeTable = charTypeTable; this.charTypeTable = charTypeTable;
this.splitOnNumerics = splitOnNumerics;
this.protWords = protWords;
} }
/**
* Compatibility constructor
*
* @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, null);
}
/** /**
* @param in Token stream to be filtered. * @param in Token stream to be filtered.
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot" * @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
@ -172,7 +202,20 @@ final class WordDelimiterFilter extends TokenFilter {
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) * @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42" * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
* @param protWords If not null, the set of tokens to protect from being delimited
*/ */
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, protWords);
}
/**
* Compatibility constructor
*
* @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
* instead.
*/
@Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) { public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal); this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
} }
@ -180,23 +223,23 @@ final class WordDelimiterFilter extends TokenFilter {
* Compatibility constructor * Compatibility constructor
* *
* @deprecated Use * @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)} * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
* instead. * instead.
*/ */
@Deprecated @Deprecated
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) { public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0); this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
} }
/** /**
* Compatibility constructor * Compatibility constructor
* *
* @deprecated Use * @deprecated Use
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)} * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
* instead. * instead.
*/ */
@Deprecated @Deprecated
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) { public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0); this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
} }
int charType(int ch) { int charType(int ch) {
@ -273,6 +316,11 @@ final class WordDelimiterFilter extends TokenFilter {
int start=0; int start=0;
if (len ==0) continue; if (len ==0) continue;
//skip protected tokens
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
return t;
}
origPosIncrement += t.getPositionIncrement(); origPosIncrement += t.getPositionIncrement();
// Avoid calling charType more than once for each char (basically // Avoid calling charType more than once for each char (basically
@ -344,6 +392,9 @@ final class WordDelimiterFilter extends TokenFilter {
} else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) { } else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
// UPPER->LOWER: Don't split // UPPER->LOWER: Don't split
} else if(splitOnNumerics == 0 &&
( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) {
// ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
} else { } else {
// NOTE: this code currently assumes that only one flag // NOTE: this code currently assumes that only one flag
// is set for each character now, so we don't have // is set for each character now, so we don't have

View File

@ -17,19 +17,59 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import java.util.Map; import java.util.Map;
import java.io.File;
import java.util.List;
import java.io.IOException;
/** /**
* @version $Id$ * @version $Id$
*/ */
public class WordDelimiterFilterFactory extends BaseTokenFilterFactory { public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
public void inform(ResourceLoader loader) {
String wordFiles = args.get(PROTECTED_TOKENS);
if (wordFiles != null) {
try {
File protectedWordFiles = new File(wordFiles);
if (protectedWordFiles.exists()) {
List<String> wlist = loader.getLines(wordFiles);
//This cast is safe in Lucene
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
} else {
List<String> files = StrUtils.splitFileNames(wordFiles);
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
if (protectedWords == null)
protectedWords = new CharArraySet(wlist, false);
else
protectedWords.addAll(wlist);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
private CharArraySet protectedWords = null;
int generateWordParts=0; int generateWordParts=0;
int generateNumberParts=0; int generateNumberParts=0;
int catenateWords=0; int catenateWords=0;
int catenateNumbers=0; int catenateNumbers=0;
int catenateAll=0; int catenateAll=0;
int splitOnCaseChange=0; int splitOnCaseChange=0;
int splitOnNumerics=0;
int preserveOriginal=0; int preserveOriginal=0;
@Override @Override
@ -41,6 +81,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
catenateNumbers = getInt("catenateNumbers", 0); catenateNumbers = getInt("catenateNumbers", 0);
catenateAll = getInt("catenateAll", 0); catenateAll = getInt("catenateAll", 0);
splitOnCaseChange = getInt("splitOnCaseChange", 1); splitOnCaseChange = getInt("splitOnCaseChange", 1);
splitOnNumerics = getInt("splitOnNumerics", 1);
preserveOriginal = getInt("preserveOriginal", 0); preserveOriginal = getInt("preserveOriginal", 0);
} }
@ -48,6 +89,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
return new WordDelimiterFilter(input, return new WordDelimiterFilter(input,
generateWordParts, generateNumberParts, generateWordParts, generateNumberParts,
catenateWords, catenateNumbers, catenateAll, catenateWords, catenateNumbers, catenateAll,
splitOnCaseChange, preserveOriginal); splitOnCaseChange, preserveOriginal,
splitOnNumerics, protectedWords);
} }
} }

View File

@ -278,4 +278,53 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
assertEquals(12, t.startOffset()); assertEquals(12, t.startOffset());
assertEquals(15, t.endOffset()); assertEquals(15, t.endOffset());
} }
public void testAlphaNumericWords(){
assertU(adoc("id", "68","numericsubword","Java/J2SE"));
assertU(commit());
assertQ("j2se found",
req("numericsubword:(J2SE)")
,"//result[@numFound=1]"
);
assertQ("no j2 or se",
req("numericsubword:(J2 OR SE)")
,"//result[@numFound=0]"
);
}
public void testProtectedWords(){
assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
assertU(commit());
assertQ("java found",
req("protectedsubword:(java)")
,"//result[@numFound=1]"
);
assertQ(".net found",
req("protectedsubword:(.net)")
,"//result[@numFound=1]"
);
assertQ("c# found",
req("protectedsubword:(c#)")
,"//result[@numFound=1]"
);
assertQ("c++ found",
req("protectedsubword:(c++)")
,"//result[@numFound=1]"
);
assertQ("c found?",
req("protectedsubword:c")
,"//result[@numFound=0]"
);
assertQ("net found?",
req("protectedsubword:net")
,"//result[@numFound=0]"
);
}
} }

View File

@ -18,3 +18,6 @@
#to test, we will use words that would normally obviously be stemmed. #to test, we will use words that would normally obviously be stemmed.
cats cats
ridding ridding
c#
c++
.net

View File

@ -252,6 +252,36 @@
</analyzer> </analyzer>
</fieldtype> </fieldtype>
<fieldtype name="numericsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="protectedsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<!-- more flexible in matching skus, but more chance of a false match --> <!-- more flexible in matching skus, but more chance of a false match -->
<fieldtype name="skutype1" class="solr.TextField"> <fieldtype name="skutype1" class="solr.TextField">
<analyzer type="index"> <analyzer type="index">
@ -387,6 +417,9 @@
<field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/> <field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
<field name="subword" type="subword" indexed="true" stored="true"/> <field name="subword" type="subword" indexed="true" stored="true"/>
<field name="numericsubword" type="numericsubword" indexed="true" stored="true"/>
<field name="protectedsubword" type="protectedsubword" indexed="true" stored="true"/>
<field name="sku1" type="skutype1" indexed="true" stored="true"/> <field name="sku1" type="skutype1" indexed="true" stored="true"/>
<field name="sku2" type="skutype2" indexed="true" stored="true"/> <field name="sku2" type="skutype2" indexed="true" stored="true"/>