From 39c1b9b3a5fee7e7896db94f1fbaada26d18cc4f Mon Sep 17 00:00:00 2001 From: "Chris M. Hostetter" Date: Thu, 18 Dec 2008 08:50:27 +0000 Subject: [PATCH] SOLR-876: WordDelimiterFilter splitOnNumerics and protwords git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@727677 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + .../solr/analysis/WordDelimiterFilter.java | 61 +++++++++++++++++-- .../analysis/WordDelimiterFilterFactory.java | 46 +++++++++++++- .../analysis/TestWordDelimiterFilter.java | 49 +++++++++++++++ src/test/test-files/solr/conf/protwords.txt | 3 + src/test/test-files/solr/conf/schema.xml | 33 ++++++++++ 6 files changed, 188 insertions(+), 7 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 84b182ab0b1..56fe7a872b8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -118,6 +118,9 @@ New Features optimized distributed faceting refinement by lowering parsing overhead and by making requests and responses smaller. +15. SOLR-876: WOrdDelimiterFilter now supports a splitOnNumerics + option, as well as a list of protected terms. + (Dan Rosher via hossman) Optimizations ---------------------- diff --git a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java index 2044e960e8b..1818b74c06e 100644 --- a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java +++ b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java @@ -20,6 +20,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.CharArraySet; import java.io.IOException; import java.util.ArrayList; @@ -140,6 +141,18 @@ final class WordDelimiterFilter extends TokenFilter { */ final int preserveOriginal; + /** + * If 0, causes numeric changes to be ignored (subwords will only be generated + * given SUBWORD_DELIM tokens). (Defaults to 1) + */ + final int splitOnNumerics; + + /** + * If not null is the set of tokens to protect from being delimited + * + */ + final CharArraySet protWords; + /** * * @param in Token stream to be filtered. @@ -151,8 +164,10 @@ final class WordDelimiterFilter extends TokenFilter { * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" * @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42" + * @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se" + * @param protWords If not null is the set of tokens to protect from being delimited */ - public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) { + public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) { super(in); this.generateWordParts = generateWordParts; this.generateNumberParts = generateNumberParts; @@ -162,7 +177,22 @@ final class WordDelimiterFilter extends TokenFilter { this.splitOnCaseChange = splitOnCaseChange; this.preserveOriginal = preserveOriginal; this.charTypeTable = charTypeTable; + this.splitOnNumerics = splitOnNumerics; + this.protWords = protWords; } + + /** + * Compatibility constructor + * + * @deprecated Use + * {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, CharArraySet)} + * instead. + */ + @Deprecated + public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) { + this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, null); + } + /** * @param in Token stream to be filtered. * @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot" @@ -172,7 +202,20 @@ final class WordDelimiterFilter extends TokenFilter { * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000" * @param splitOnCaseChange 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42" + * @param splitOnNumerics 1, causes "j2se" to be three tokens; "j" "2" "se" + * @param protWords If not null is the set of tokens to protect from being delimited */ + public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) { + this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, protWords); + } + + /** * Compatibility constructor + * + * @deprecated Use + * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)} + * instead. + */ + @Deprecated public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) { this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal); } @@ -180,23 +223,23 @@ final class WordDelimiterFilter extends TokenFilter { * Compatibility constructor * * @deprecated Use - * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)} + * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)} * instead. */ @Deprecated public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) { - this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0); + this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null); } /** * Compatibility constructor * * @deprecated Use - * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int)} + * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, CharArraySet)} * instead. */ @Deprecated public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) { - this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0); + this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null); } int charType(int ch) { @@ -273,6 +316,11 @@ final class WordDelimiterFilter extends TokenFilter { int start=0; if (len ==0) continue; + //skip protected tokens + if (protWords != null && protWords.contains(termBuffer, 0, len)) { + return t; + } + origPosIncrement += t.getPositionIncrement(); // Avoid calling charType more than once for each char (basically @@ -344,6 +392,9 @@ final class WordDelimiterFilter extends TokenFilter { } else if ((lastType & UPPER)!=0 && (type & LOWER)!=0) { // UPPER->LOWER: Don't split + } else if(splitOnNumerics == 0 && + ( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) { + // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split } else { // NOTE: this code currently assumes that only one flag // is set for each character now, so we don't have diff --git a/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java b/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java index 132ae993c8d..3939e1348a0 100644 --- a/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java +++ b/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java @@ -17,19 +17,59 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.CharArraySet; + +import org.apache.solr.util.plugin.ResourceLoaderAware; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.util.StrUtils; + import java.util.Map; +import java.io.File; +import java.util.List; +import java.io.IOException; + /** * @version $Id$ */ -public class WordDelimiterFilterFactory extends BaseTokenFilterFactory { +public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + public static final String PROTECTED_TOKENS = "protected"; + + public void inform(ResourceLoader loader) { + String wordFiles = args.get(PROTECTED_TOKENS); + if (wordFiles != null) { + try { + File protectedWordFiles = new File(wordFiles); + if (protectedWordFiles.exists()) { + List wlist = loader.getLines(wordFiles); + //This cast is safe in Lucene + protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally + } else { + List files = StrUtils.splitFileNames(wordFiles); + for (String file : files) { + List wlist = loader.getLines(file.trim()); + if (protectedWords == null) + protectedWords = new CharArraySet(wlist, false); + else + protectedWords.addAll(wlist); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + private CharArraySet protectedWords = null; + int generateWordParts=0; int generateNumberParts=0; int catenateWords=0; int catenateNumbers=0; int catenateAll=0; int splitOnCaseChange=0; + int splitOnNumerics=0; int preserveOriginal=0; @Override @@ -41,6 +81,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory { catenateNumbers = getInt("catenateNumbers", 0); catenateAll = getInt("catenateAll", 0); splitOnCaseChange = getInt("splitOnCaseChange", 1); + splitOnNumerics = getInt("splitOnNumerics", 1); preserveOriginal = getInt("preserveOriginal", 0); } @@ -48,6 +89,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory { return new WordDelimiterFilter(input, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, - splitOnCaseChange, preserveOriginal); + splitOnCaseChange, preserveOriginal, + splitOnNumerics, protectedWords); } } diff --git a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java index b609d983876..dda71ea173e 100644 --- a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java +++ b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java @@ -278,4 +278,53 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase { assertEquals(12, t.startOffset()); assertEquals(15, t.endOffset()); } + + public void testAlphaNumericWords(){ + assertU(adoc("id", "68","numericsubword","Java/J2SE")); + assertU(commit()); + + assertQ("j2se found", + req("numericsubword:(J2SE)") + ,"//result[@numFound=1]" + ); + assertQ("no j2 or se", + req("numericsubword:(J2 OR SE)") + ,"//result[@numFound=0]" + ); + } + + public void testProtectedWords(){ + assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE")); + assertU(commit()); + + assertQ("java found", + req("protectedsubword:(java)") + ,"//result[@numFound=1]" + ); + + assertQ(".net found", + req("protectedsubword:(.net)") + ,"//result[@numFound=1]" + ); + + assertQ("c# found", + req("protectedsubword:(c#)") + ,"//result[@numFound=1]" + ); + + assertQ("c++ found", + req("protectedsubword:(c++)") + ,"//result[@numFound=1]" + ); + + assertQ("c found?", + req("protectedsubword:c") + ,"//result[@numFound=0]" + ); + assertQ("net found?", + req("protectedsubword:net") + ,"//result[@numFound=0]" + ); + } + } diff --git a/src/test/test-files/solr/conf/protwords.txt b/src/test/test-files/solr/conf/protwords.txt index 073630bef44..f896cfae30a 100644 --- a/src/test/test-files/solr/conf/protwords.txt +++ b/src/test/test-files/solr/conf/protwords.txt @@ -18,3 +18,6 @@ #to test, we will use words that would normally obviously be stemmed. cats ridding +c# +c++ +.net diff --git a/src/test/test-files/solr/conf/schema.xml b/src/test/test-files/solr/conf/schema.xml index c1df70ca929..50c0f421447 100644 --- a/src/test/test-files/solr/conf/schema.xml +++ b/src/test/test-files/solr/conf/schema.xml @@ -252,6 +252,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -387,6 +417,9 @@ + + +