diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java index d53ea0a2608..b86df4d0e03 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java @@ -21,8 +21,9 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*; /** * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules. + * @lucene.internal */ -final class WordDelimiterIterator { +public final class WordDelimiterIterator { /** Indicates the end of iteration */ public static final int DONE = -1; @@ -270,6 +271,16 @@ final class WordDelimiterIterator { if (ch < charTypeTable.length) { return charTypeTable[ch]; } + return getType(ch); + } + + /** + * Computes the type of the given character + * + * @param ch Character whose type is to be determined + * @return Type of the character + */ + public static byte getType(int ch) { switch (Character.getType(ch)) { case Character.UPPERCASE_LETTER: return UPPER; case Character.LOWERCASE_LETTER: return LOWER; diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 841ab039b4b..b98f3bc081d 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -245,6 +245,10 @@ New Features * SOLR-1316: Create autosuggest component. (Ankul Garg, Jason Rutherglen, Shalin Shekhar Mangar, gsingers, Robert Muir, ab) + +* SOLR-2059: Add "types" attribute to WordDelimiterFilterFactory, which + allows you to customize how WordDelimiterFilter tokenizes text with + a configuration file. (Peter Karich, rmuir) Optimizations diff --git a/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java b/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java index cd903cf550d..712da5edb65 100644 --- a/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java @@ -16,14 +16,23 @@ */ package org.apache.solr.analysis; + import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.solr.util.plugin.ResourceLoaderAware; import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.util.StrUtils; +import java.util.ArrayList; +import java.util.List; import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.io.IOException; @@ -32,7 +41,8 @@ import java.io.IOException; */ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { public static final String PROTECTED_TOKENS = "protected"; - + public static final String TYPES = "types"; + public void inform(ResourceLoader loader) { String wordFiles = args.get(PROTECTED_TOKENS); if (wordFiles != null) { @@ -42,6 +52,20 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement throw new RuntimeException(e); } } + String types = args.get(TYPES); + if (types != null) { + try { + List files = StrUtils.splitFileNames( types ); + List wlist = new ArrayList(); + for( String file : files ){ + List lines = loader.getLines( file.trim() ); + wlist.addAll( lines ); + } + typeTable = parseTypes(wlist); + } catch (IOException e) { + throw new RuntimeException(e); + } + } } private CharArraySet protectedWords = null; @@ -55,6 +79,7 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement int splitOnNumerics=0; int preserveOriginal=0; int stemEnglishPossessive=0; + byte[] typeTable = null; @Override public void init(Map args) { @@ -71,10 +96,87 @@ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implement } public WordDelimiterFilter create(TokenStream input) { - return new WordDelimiterFilter(input, + return new WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protectedWords); } + + // source => type + private static Pattern typePattern = Pattern.compile( "(.*)\\s*=>\\s*(.*)\\s*$" ); + + /** parses a list of MappingCharFilter style rules into a custom byte[] type table */ + private byte[] parseTypes(List rules) { + SortedMap typeMap = new TreeMap(); + for( String rule : rules ){ + Matcher m = typePattern.matcher(rule); + if( !m.find() ) + throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]"); + String lhs = parseString(m.group(1).trim()); + Byte rhs = parseType(m.group(2).trim()); + if (lhs.length() != 1) + throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); + if (rhs == null) + throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); + typeMap.put(lhs.charAt(0), rhs); + } + + // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance + byte types[] = new byte[Math.max(typeMap.lastKey()+1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; + for (int i = 0; i < types.length; i++) + types[i] = WordDelimiterIterator.getType(i); + for (Map.Entry mapping : typeMap.entrySet()) + types[mapping.getKey()] = mapping.getValue(); + return types; + } + + private Byte parseType(String s) { + if (s.equals("LOWER")) + return WordDelimiterFilter.LOWER; + else if (s.equals("UPPER")) + return WordDelimiterFilter.UPPER; + else if (s.equals("ALPHA")) + return WordDelimiterFilter.ALPHA; + else if (s.equals("DIGIT")) + return WordDelimiterFilter.DIGIT; + else if (s.equals("ALPHANUM")) + return WordDelimiterFilter.ALPHANUM; + else if (s.equals("SUBWORD_DELIM")) + return WordDelimiterFilter.SUBWORD_DELIM; + else + return null; + } + + char[] out = new char[256]; + + private String parseString(String s){ + int readPos = 0; + int len = s.length(); + int writePos = 0; + while( readPos < len ){ + char c = s.charAt( readPos++ ); + if( c == '\\' ){ + if( readPos >= len ) + throw new RuntimeException( "Invalid escaped char in [" + s + "]" ); + c = s.charAt( readPos++ ); + switch( c ) { + case '\\' : c = '\\'; break; + case 'n' : c = '\n'; break; + case 't' : c = '\t'; break; + case 'r' : c = '\r'; break; + case 'b' : c = '\b'; break; + case 'f' : c = '\f'; break; + case 'u' : + if( readPos + 3 >= len ) + throw new RuntimeException( "Invalid escaped char in [" + s + "]" ); + c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 ); + readPos += 4; + break; + } + } + out[writePos++] = c; + } + return new String( out, 0, writePos ); + } } diff --git a/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java index 9759372f5df..c7abb822f83 100644 --- a/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java @@ -17,7 +17,15 @@ package org.apache.solr.analysis; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.core.SolrResourceLoader; import org.junit.BeforeClass; import org.junit.Test; @@ -183,4 +191,51 @@ public class TestWordDelimiterFilterFactory extends SolrTestCaseJ4 { ); clearIndex(); } + + @Test + public void testCustomTypes() throws Exception { + String testText = "I borrowed $5,400.00 at 25% interest-rate"; + WordDelimiterFilterFactory factoryDefault = new WordDelimiterFilterFactory(); + ResourceLoader loader = new SolrResourceLoader(null, null); + Map args = new HashMap(); + args.put("generateWordParts", "1"); + args.put("generateNumberParts", "1"); + args.put("catenateWords", "1"); + args.put("catenateNumbers", "1"); + args.put("catenateAll", "0"); + args.put("splitOnCaseChange", "1"); + + /* default behavior */ + factoryDefault.init(args); + factoryDefault.inform(loader); + + TokenStream ts = factoryDefault.create( + new WhitespaceTokenizer(BaseTokenTestCase.DEFAULT_VERSION, new StringReader(testText))); + BaseTokenTestCase.assertTokenStreamContents(ts, + new String[] { "I", "borrowed", "5", "400", "00", "540000", "at", "25", "interest", "rate", "interestrate" }); + + ts = factoryDefault.create( + new WhitespaceTokenizer(BaseTokenTestCase.DEFAULT_VERSION, new StringReader("foo\u200Dbar"))); + BaseTokenTestCase.assertTokenStreamContents(ts, + new String[] { "foo", "bar", "foobar" }); + + + /* custom behavior */ + WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory(); + // use a custom type mapping + args.put("types", "wdftypes.txt"); + factoryCustom.init(args); + factoryCustom.inform(loader); + + ts = factoryCustom.create( + new WhitespaceTokenizer(BaseTokenTestCase.DEFAULT_VERSION, new StringReader(testText))); + BaseTokenTestCase.assertTokenStreamContents(ts, + new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "rate", "interestrate" }); + + /* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */ + ts = factoryCustom.create( + new WhitespaceTokenizer(BaseTokenTestCase.DEFAULT_VERSION, new StringReader("foo\u200Dbar"))); + BaseTokenTestCase.assertTokenStreamContents(ts, + new String[] { "foo\u200Dbar" }); + } } diff --git a/solr/src/test/test-files/solr/conf/wdftypes.txt b/solr/src/test/test-files/solr/conf/wdftypes.txt new file mode 100644 index 00000000000..7378b0802e7 --- /dev/null +++ b/solr/src/test/test-files/solr/conf/wdftypes.txt @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# A customized type mapping for WordDelimiterFilterFactory +# the allowable types are: LOWER, UPPER, ALPHA, DIGIT, ALPHANUM, SUBWORD_DELIM +# +# the default for any character without a mapping is always computed from +# Unicode character properties + +# Map the $, %, '.', and ',' characters to DIGIT +# This might be useful for financial data. +$ => DIGIT +% => DIGIT +. => DIGIT +\u002C => DIGIT + +# in some cases you might not want to split on ZWJ +# this also tests the case where we need a bigger byte[] +# see http://en.wikipedia.org/wiki/Zero-width_joiner +\u200D => ALPHANUM