mirror of https://github.com/apache/lucene.git
SOLR-876: WordDelimiterFilter splitOnNumerics and protwords
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@727677 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
663f27bd80
commit
39c1b9b3a5
|
@ -118,6 +118,9 @@ New Features
|
|||
optimized distributed faceting refinement by lowering parsing overhead and
|
||||
by making requests and responses smaller.
|
||||
|
||||
15. SOLR-876: WOrdDelimiterFilter now supports a splitOnNumerics
|
||||
option, as well as a list of protected terms.
|
||||
(Dan Rosher via hossman)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.solr.analysis;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
|
@ -140,6 +141,18 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
*/
|
||||
final int preserveOriginal;
|
||||
|
||||
/**
|
||||
* If 0, causes numeric changes to be ignored (subwords will only be generated
|
||||
* given SUBWORD_DELIM tokens). (Defaults to 1)
|
||||
*/
|
||||
final int splitOnNumerics;
|
||||
|
||||
/**
|
||||
* If not null is the set of tokens to protect from being delimited
|
||||
*
|
||||
*/
|
||||
final CharArraySet protWords;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param in Token stream to be filtered.
|
||||
|
@ -151,8 +164,10 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
||||
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
|
||||
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
||||
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
|
||||
super(in);
|
||||
this.generateWordParts = generateWordParts;
|
||||
this.generateNumberParts = generateNumberParts;
|
||||
|
@ -162,7 +177,22 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
this.splitOnCaseChange = splitOnCaseChange;
|
||||
this.preserveOriginal = preserveOriginal;
|
||||
this.charTypeTable = charTypeTable;
|
||||
this.splitOnNumerics = splitOnNumerics;
|
||||
this.protWords = protWords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compatibility constructor
|
||||
*
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
|
||||
this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param in Token stream to be filtered.
|
||||
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
|
||||
|
@ -172,7 +202,20 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
* @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
||||
* @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
|
||||
* @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
||||
* @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se"
|
||||
* @param protWords If not null is the set of tokens to protect from being delimited
|
||||
*/
|
||||
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, protWords);
|
||||
}
|
||||
|
||||
/** * Compatibility constructor
|
||||
*
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
|
||||
}
|
||||
|
@ -180,23 +223,23 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
* Compatibility constructor
|
||||
*
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)}
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
|
||||
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
|
||||
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
|
||||
}
|
||||
/**
|
||||
* Compatibility constructor
|
||||
*
|
||||
* @deprecated Use
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)}
|
||||
* {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)}
|
||||
* instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0);
|
||||
this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
|
||||
}
|
||||
|
||||
int charType(int ch) {
|
||||
|
@ -273,6 +316,11 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
int start=0;
|
||||
if (len ==0) continue;
|
||||
|
||||
//skip protected tokens
|
||||
if (protWords != null && protWords.contains(termBuffer, 0, len)) {
|
||||
return t;
|
||||
}
|
||||
|
||||
origPosIncrement += t.getPositionIncrement();
|
||||
|
||||
// Avoid calling charType more than once for each char (basically
|
||||
|
@ -344,6 +392,9 @@ final class WordDelimiterFilter extends TokenFilter {
|
|||
|
||||
} else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) {
|
||||
// UPPER->LOWER: Don't split
|
||||
} else if(splitOnNumerics == 0 &&
|
||||
( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) {
|
||||
// ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
|
||||
} else {
|
||||
// NOTE: this code currently assumes that only one flag
|
||||
// is set for each character now, so we don't have
|
||||
|
|
|
@ -17,19 +17,59 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
|
||||
|
||||
import java.util.Map;
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
*/
|
||||
public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
|
||||
public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
public static final String PROTECTED_TOKENS = "protected";
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
String wordFiles = args.get(PROTECTED_TOKENS);
|
||||
if (wordFiles != null) {
|
||||
try {
|
||||
File protectedWordFiles = new File(wordFiles);
|
||||
if (protectedWordFiles.exists()) {
|
||||
List<String> wlist = loader.getLines(wordFiles);
|
||||
//This cast is safe in Lucene
|
||||
protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
|
||||
} else {
|
||||
List<String> files = StrUtils.splitFileNames(wordFiles);
|
||||
for (String file : files) {
|
||||
List<String> wlist = loader.getLines(file.trim());
|
||||
if (protectedWords == null)
|
||||
protectedWords = new CharArraySet(wlist, false);
|
||||
else
|
||||
protectedWords.addAll(wlist);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private CharArraySet protectedWords = null;
|
||||
|
||||
int generateWordParts=0;
|
||||
int generateNumberParts=0;
|
||||
int catenateWords=0;
|
||||
int catenateNumbers=0;
|
||||
int catenateAll=0;
|
||||
int splitOnCaseChange=0;
|
||||
int splitOnNumerics=0;
|
||||
int preserveOriginal=0;
|
||||
|
||||
@Override
|
||||
|
@ -41,6 +81,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
|
|||
catenateNumbers = getInt("catenateNumbers", 0);
|
||||
catenateAll = getInt("catenateAll", 0);
|
||||
splitOnCaseChange = getInt("splitOnCaseChange", 1);
|
||||
splitOnNumerics = getInt("splitOnNumerics", 1);
|
||||
preserveOriginal = getInt("preserveOriginal", 0);
|
||||
}
|
||||
|
||||
|
@ -48,6 +89,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory {
|
|||
return new WordDelimiterFilter(input,
|
||||
generateWordParts, generateNumberParts,
|
||||
catenateWords, catenateNumbers, catenateAll,
|
||||
splitOnCaseChange, preserveOriginal);
|
||||
splitOnCaseChange, preserveOriginal,
|
||||
splitOnNumerics, protectedWords);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -278,4 +278,53 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
|||
assertEquals(12, t.startOffset());
|
||||
assertEquals(15, t.endOffset());
|
||||
}
|
||||
|
||||
public void testAlphaNumericWords(){
|
||||
assertU(adoc("id", "68","numericsubword","Java/J2SE"));
|
||||
assertU(commit());
|
||||
|
||||
assertQ("j2se found",
|
||||
req("numericsubword:(J2SE)")
|
||||
,"//result[@numFound=1]"
|
||||
);
|
||||
assertQ("no j2 or se",
|
||||
req("numericsubword:(J2 OR SE)")
|
||||
,"//result[@numFound=0]"
|
||||
);
|
||||
}
|
||||
|
||||
public void testProtectedWords(){
|
||||
assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
|
||||
assertU(commit());
|
||||
|
||||
assertQ("java found",
|
||||
req("protectedsubword:(java)")
|
||||
,"//result[@numFound=1]"
|
||||
);
|
||||
|
||||
assertQ(".net found",
|
||||
req("protectedsubword:(.net)")
|
||||
,"//result[@numFound=1]"
|
||||
);
|
||||
|
||||
assertQ("c# found",
|
||||
req("protectedsubword:(c#)")
|
||||
,"//result[@numFound=1]"
|
||||
);
|
||||
|
||||
assertQ("c++ found",
|
||||
req("protectedsubword:(c++)")
|
||||
,"//result[@numFound=1]"
|
||||
);
|
||||
|
||||
assertQ("c found?",
|
||||
req("protectedsubword:c")
|
||||
,"//result[@numFound=0]"
|
||||
);
|
||||
assertQ("net found?",
|
||||
req("protectedsubword:net")
|
||||
,"//result[@numFound=0]"
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -18,3 +18,6 @@
|
|||
#to test, we will use words that would normally obviously be stemmed.
|
||||
cats
|
||||
ridding
|
||||
c#
|
||||
c++
|
||||
.net
|
||||
|
|
|
@ -252,6 +252,36 @@
|
|||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="numericsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
|
||||
<filter class="solr.StopFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
<filter class="solr.StopFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="protectedsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" protected="protwords.txt" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
|
||||
<!-- more flexible in matching skus, but more chance of a false match -->
|
||||
<fieldtype name="skutype1" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
|
@ -387,6 +417,9 @@
|
|||
<field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
|
||||
|
||||
<field name="subword" type="subword" indexed="true" stored="true"/>
|
||||
<field name="numericsubword" type="numericsubword" indexed="true" stored="true"/>
|
||||
<field name="protectedsubword" type="protectedsubword" indexed="true" stored="true"/>
|
||||
|
||||
<field name="sku1" type="skutype1" indexed="true" stored="true"/>
|
||||
<field name="sku2" type="skutype2" indexed="true" stored="true"/>
|
||||
|
||||
|
|
Loading…
Reference in New Issue