mirror of https://github.com/apache/lucene.git
SOLR-876: WordDelimiterFilter splitOnNumerics and protwords
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@727677 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
663f27bd80
commit
39c1b9b3a5
|
@ -118,6 +118,9 @@ New Features
|
||||||
optimized distributed faceting refinement by lowering parsing overhead and
|
optimized distributed faceting refinement by lowering parsing overhead and
|
||||||
by making requests and responses smaller.
|
by making requests and responses smaller.
|
||||||
|
|
||||||
|
15. SOLR-876: WordDelimiterFilter now supports a splitOnNumerics
|
||||||
|
option, as well as a list of protected terms.
|
||||||
|
(Dan Rosher via hossman)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.solr.analysis;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -140,6 +141,18 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
*/
|
*/
|
||||||
final int preserveOriginal;
|
final int preserveOriginal;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If 0, causes numeric changes to be ignored (subwords will only be generated
|
||||||
|
* given SUBWORD_DELIM tokens). (Defaults to 1)
|
||||||
|
*/
|
||||||
|
final int splitOnNumerics;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If not null, this is the set of tokens to protect from being delimited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
final CharArraySet protWords;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param in Token stream to be filtered.
|
* @param in Token stream to be filtered.
|
||||||
|
@ -151,8 +164,10 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
||||||
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
|
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
|
||||||
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
||||||
|
* @param splitOnNumerics If 1, causes "j2se" to be three tokens; "j" "2" "se"
|
||||||
|
* @param protWords If not null, the set of tokens to protect from being delimited
|
||||||
*/
|
*/
|
||||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
|
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
|
||||||
super(in);
|
super(in);
|
||||||
this.generateWordParts = generateWordParts;
|
this.generateWordParts = generateWordParts;
|
||||||
this.generateNumberParts = generateNumberParts;
|
this.generateNumberParts = generateNumberParts;
|
||||||
|
@ -162,7 +177,22 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
this.splitOnCaseChange = splitOnCaseChange;
|
this.splitOnCaseChange = splitOnCaseChange;
|
||||||
this.preserveOriginal = preserveOriginal;
|
this.preserveOriginal = preserveOriginal;
|
||||||
this.charTypeTable = charTypeTable;
|
this.charTypeTable = charTypeTable;
|
||||||
|
this.splitOnNumerics = splitOnNumerics;
|
||||||
|
this.protWords = protWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compatibility constructor
|
||||||
|
*
|
||||||
|
* @deprecated Use
|
||||||
|
* {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)}
|
||||||
|
* instead.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
|
||||||
|
this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, null);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param in Token stream to be filtered.
|
* @param in Token stream to be filtered.
|
||||||
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
|
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
|
||||||
|
@ -172,7 +202,20 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
||||||
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
|
* @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
|
||||||
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
||||||
|
* @param splitOnNumerics If 1, causes "j2se" to be three tokens; "j" "2" "se"
|
||||||
|
* @param protWords If not null, the set of tokens to protect from being delimited
|
||||||
*/
|
*/
|
||||||
|
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
|
||||||
|
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, protWords);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compatibility constructor
|
||||||
|
*
|
||||||
|
* @deprecated Use
|
||||||
|
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||||
|
* instead.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
|
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
|
||||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
|
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
|
||||||
}
|
}
|
||||||
|
@ -180,23 +223,23 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
* Compatibility constructor
|
* Compatibility constructor
|
||||||
*
|
*
|
||||||
* @deprecated Use
|
* @deprecated Use
|
||||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)}
|
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||||
* instead.
|
* instead.
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
|
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
|
||||||
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
|
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* Compatibility constructor
|
* Compatibility constructor
|
||||||
*
|
*
|
||||||
* @deprecated Use
|
* @deprecated Use
|
||||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)}
|
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||||
* instead.
|
* instead.
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
|
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
|
||||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
|
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
int charType(int ch) {
|
int charType(int ch) {
|
||||||
|
@ -273,6 +316,11 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
int start=0;
|
int start=0;
|
||||||
if (len ==0) continue;
|
if (len ==0) continue;
|
||||||
|
|
||||||
|
//skip protected tokens
|
||||||
|
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
|
||||||
origPosIncrement += t.getPositionIncrement();
|
origPosIncrement += t.getPositionIncrement();
|
||||||
|
|
||||||
// Avoid calling charType more than once for each char (basically
|
// Avoid calling charType more than once for each char (basically
|
||||||
|
@ -344,6 +392,9 @@ final class WordDelimiterFilter extends TokenFilter {
|
||||||
|
|
||||||
} else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
|
} else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
|
||||||
// UPPER->LOWER: Don't split
|
// UPPER->LOWER: Don't split
|
||||||
|
} else if(splitOnNumerics == 0 &&
|
||||||
|
( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) {
|
||||||
|
// ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
|
||||||
} else {
|
} else {
|
||||||
// NOTE: this code currently assumes that only one flag
|
// NOTE: this code currently assumes that only one flag
|
||||||
// is set for each character now, so we don't have
|
// is set for each character now, so we don't have
|
||||||
|
|
|
@ -17,19 +17,59 @@
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
package org.apache.solr.analysis;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
|
||||||
|
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
|
import org.apache.solr.common.ResourceLoader;
|
||||||
|
import org.apache.solr.common.util.StrUtils;
|
||||||
|
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.List;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
|
public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||||
|
public static final String PROTECTED_TOKENS = "protected";
|
||||||
|
|
||||||
|
public void inform(ResourceLoader loader) {
|
||||||
|
String wordFiles = args.get(PROTECTED_TOKENS);
|
||||||
|
if (wordFiles != null) {
|
||||||
|
try {
|
||||||
|
File protectedWordFiles = new File(wordFiles);
|
||||||
|
if (protectedWordFiles.exists()) {
|
||||||
|
List<String> wlist = loader.getLines(wordFiles);
|
||||||
|
//This cast is safe in Lucene
|
||||||
|
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
|
||||||
|
} else {
|
||||||
|
List<String> files = StrUtils.splitFileNames(wordFiles);
|
||||||
|
for (String file : files) {
|
||||||
|
List<String> wlist = loader.getLines(file.trim());
|
||||||
|
if (protectedWords == null)
|
||||||
|
protectedWords = new CharArraySet(wlist, false);
|
||||||
|
else
|
||||||
|
protectedWords.addAll(wlist);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private CharArraySet protectedWords = null;
|
||||||
|
|
||||||
int generateWordParts=0;
|
int generateWordParts=0;
|
||||||
int generateNumberParts=0;
|
int generateNumberParts=0;
|
||||||
int catenateWords=0;
|
int catenateWords=0;
|
||||||
int catenateNumbers=0;
|
int catenateNumbers=0;
|
||||||
int catenateAll=0;
|
int catenateAll=0;
|
||||||
int splitOnCaseChange=0;
|
int splitOnCaseChange=0;
|
||||||
|
int splitOnNumerics=0;
|
||||||
int preserveOriginal=0;
|
int preserveOriginal=0;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -41,6 +81,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
|
||||||
catenateNumbers = getInt("catenateNumbers", 0);
|
catenateNumbers = getInt("catenateNumbers", 0);
|
||||||
catenateAll = getInt("catenateAll", 0);
|
catenateAll = getInt("catenateAll", 0);
|
||||||
splitOnCaseChange = getInt("splitOnCaseChange", 1);
|
splitOnCaseChange = getInt("splitOnCaseChange", 1);
|
||||||
|
splitOnNumerics = getInt("splitOnNumerics", 1);
|
||||||
preserveOriginal = getInt("preserveOriginal", 0);
|
preserveOriginal = getInt("preserveOriginal", 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,6 +89,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
|
||||||
return new WordDelimiterFilter(input,
|
return new WordDelimiterFilter(input,
|
||||||
generateWordParts, generateNumberParts,
|
generateWordParts, generateNumberParts,
|
||||||
catenateWords, catenateNumbers, catenateAll,
|
catenateWords, catenateNumbers, catenateAll,
|
||||||
splitOnCaseChange, preserveOriginal);
|
splitOnCaseChange, preserveOriginal,
|
||||||
|
splitOnNumerics, protectedWords);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -278,4 +278,53 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
||||||
assertEquals(12, t.startOffset());
|
assertEquals(12, t.startOffset());
|
||||||
assertEquals(15, t.endOffset());
|
assertEquals(15, t.endOffset());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testAlphaNumericWords(){
|
||||||
|
assertU(adoc("id", "68","numericsubword","Java/J2SE"));
|
||||||
|
assertU(commit());
|
||||||
|
|
||||||
|
assertQ("j2se found",
|
||||||
|
req("numericsubword:(J2SE)")
|
||||||
|
,"//result[@numFound=1]"
|
||||||
|
);
|
||||||
|
assertQ("no j2 or se",
|
||||||
|
req("numericsubword:(J2 OR SE)")
|
||||||
|
,"//result[@numFound=0]"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testProtectedWords(){
|
||||||
|
assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
|
||||||
|
assertU(commit());
|
||||||
|
|
||||||
|
assertQ("java found",
|
||||||
|
req("protectedsubword:(java)")
|
||||||
|
,"//result[@numFound=1]"
|
||||||
|
);
|
||||||
|
|
||||||
|
assertQ(".net found",
|
||||||
|
req("protectedsubword:(.net)")
|
||||||
|
,"//result[@numFound=1]"
|
||||||
|
);
|
||||||
|
|
||||||
|
assertQ("c# found",
|
||||||
|
req("protectedsubword:(c#)")
|
||||||
|
,"//result[@numFound=1]"
|
||||||
|
);
|
||||||
|
|
||||||
|
assertQ("c++ found",
|
||||||
|
req("protectedsubword:(c++)")
|
||||||
|
,"//result[@numFound=1]"
|
||||||
|
);
|
||||||
|
|
||||||
|
assertQ("c found?",
|
||||||
|
req("protectedsubword:c")
|
||||||
|
,"//result[@numFound=0]"
|
||||||
|
);
|
||||||
|
assertQ("net found?",
|
||||||
|
req("protectedsubword:net")
|
||||||
|
,"//result[@numFound=0]"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,3 +18,6 @@
|
||||||
#to test, we will use words that would normally obviously be stemmed.
|
#to test, we will use words that would normally obviously be stemmed.
|
||||||
cats
|
cats
|
||||||
ridding
|
ridding
|
||||||
|
c#
|
||||||
|
c++
|
||||||
|
.net
|
||||||
|
|
|
@ -252,6 +252,36 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldtype>
|
</fieldtype>
|
||||||
|
|
||||||
|
<fieldtype name="numericsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
|
||||||
|
<analyzer type="index">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
|
||||||
|
<filter class="solr.StopFilterFactory"/>
|
||||||
|
<filter class="solr.EnglishPorterFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
<analyzer type="query">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||||
|
<filter class="solr.StopFilterFactory"/>
|
||||||
|
<filter class="solr.EnglishPorterFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldtype>
|
||||||
|
|
||||||
|
<fieldtype name="protectedsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
|
||||||
|
<analyzer type="index">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
|
||||||
|
</analyzer>
|
||||||
|
<analyzer type="query">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldtype>
|
||||||
|
|
||||||
|
|
||||||
<!-- more flexible in matching skus, but more chance of a false match -->
|
<!-- more flexible in matching skus, but more chance of a false match -->
|
||||||
<fieldtype name="skutype1" class="solr.TextField">
|
<fieldtype name="skutype1" class="solr.TextField">
|
||||||
<analyzer type="index">
|
<analyzer type="index">
|
||||||
|
@ -387,6 +417,9 @@
|
||||||
<field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
|
<field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
|
||||||
|
|
||||||
<field name="subword" type="subword" indexed="true" stored="true"/>
|
<field name="subword" type="subword" indexed="true" stored="true"/>
|
||||||
|
<field name="numericsubword" type="numericsubword" indexed="true" stored="true"/>
|
||||||
|
<field name="protectedsubword" type="protectedsubword" indexed="true" stored="true"/>
|
||||||
|
|
||||||
<field name="sku1" type="skutype1" indexed="true" stored="true"/>
|
<field name="sku1" type="skutype1" indexed="true" stored="true"/>
|
||||||
<field name="sku2" type="skutype2" indexed="true" stored="true"/>
|
<field name="sku2" type="skutype2" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue