diff --git a/CHANGES.txt b/CHANGES.txt index 42d665ff891..418d2e6a5e1 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -257,6 +257,15 @@ New Features 48. SOLR-537: Use of hl.maxAlternateFieldLength parameter from solr-ruby (koji) + +49. SOLR-319: Changed SynonymFilterFactory to "tokenize" synonyms file. + To use a tokenizer, specify "tokenizerFactory" attribute in . + For example: + + + (koji) + Changes in runtime behavior 1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This diff --git a/src/java/org/apache/solr/analysis/SynonymFilterFactory.java b/src/java/org/apache/solr/analysis/SynonymFilterFactory.java index c4a0cec559e..ea0afbad3ca 100644 --- a/src/java/org/apache/solr/analysis/SynonymFilterFactory.java +++ b/src/java/org/apache/solr/analysis/SynonymFilterFactory.java @@ -17,6 +17,7 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.util.StrUtils; @@ -24,8 +25,11 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.util.plugin.ResourceLoaderAware; import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** * @version $Id$ @@ -38,6 +42,12 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso boolean ignoreCase = getBoolean("ignoreCase", false); boolean expand = getBoolean("expand", true); + String tf = args.get("tokenizerFactory"); + TokenizerFactory tokFactory = null; + if( tf != null ){ + tokFactory = loadTokenizerFactory( loader, tf, args ); + } + if (synonyms != null) { List wlist=null; try { @@ -46,7 +56,7 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso throw new RuntimeException(e); } synMap = new SynonymMap(ignoreCase); - parseRules(wlist, synMap, "=>", ",", expand); + parseRules(wlist, synMap, "=>", ",", expand,tokFactory); if (wlist.size()<=20) { SolrCore.log.fine("SynonymMap "+synonyms +":"+synMap); } @@ -55,7 +65,8 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso private SynonymMap synMap; - private static void parseRules(List rules, SynonymMap map, String mappingSep, String synSep, boolean expansion) { + static void parseRules(List rules, SynonymMap map, String mappingSep, + String synSep, boolean expansion, TokenizerFactory tokFactory) { int count=0; for (String rule : rules) { // To use regexes, we need an expression that specifies an odd number of chars. @@ -71,10 +82,10 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso if (mapping.size() > 2) { throw new RuntimeException("Invalid Synonym Rule:" + rule); } else if (mapping.size()==2) { - source = getSynList(mapping.get(0), synSep); - target = getSynList(mapping.get(1), synSep); + source = getSynList(mapping.get(0), synSep, tokFactory); + target = getSynList(mapping.get(1), synSep, tokFactory); } else { - source = getSynList(mapping.get(0), synSep); + source = getSynList(mapping.get(0), synSep, tokFactory); if (expansion) { // expand to all arguments target = source; @@ -100,21 +111,48 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso } // a , b c , d e f => [[a],[b,c],[d,e,f]] - private static List> getSynList(String str, String separator) { + private static List> getSynList(String str, String separator, TokenizerFactory tokFactory) { List strList = StrUtils.splitSmart(str, separator, false); // now split on whitespace to get a list of token strings List> synList = new ArrayList>(); for (String toks : strList) { - List tokList = StrUtils.splitWS(toks, true); + List tokList = tokFactory == null ? + StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory); synList.add(tokList); } return synList; } + + private static List splitByTokenizer(String source, TokenizerFactory tokFactory){ + StringReader reader = new StringReader( source ); + TokenStream ts = loadTokenizer(tokFactory, reader); + List tokList = new ArrayList(); + try { + for( Token token = ts.next(); token != null; token = ts.next() ){ + String text = token.termText(); + if( text.length() > 0 ) + tokList.add( text ); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + finally{ + reader.close(); + } + return tokList; + } + private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){ + TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname ); + tokFactory.init( args ); + return tokFactory; + } + + private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){ + return tokFactory.create( reader ); + } public SynonymFilter create(TokenStream input) { return new SynonymFilter(input,synMap); } - - } diff --git a/src/test/org/apache/solr/analysis/TestSynonymMap.java b/src/test/org/apache/solr/analysis/TestSynonymMap.java new file mode 100644 index 00000000000..fb627db56a4 --- /dev/null +++ b/src/test/org/apache/solr/analysis/TestSynonymMap.java @@ -0,0 +1,271 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.Token; + +public class TestSynonymMap extends AnalysisTestCase { + + public void testInvalidMappingRules() throws Exception { + SynonymMap synMap = new SynonymMap( true ); + List rules = new ArrayList( 1 ); + rules.add( "a=>b=>c" ); + try{ + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + fail( "RuntimeException must be thrown." ); + } + catch( RuntimeException expected ){} + } + + public void testReadMappingRules() throws Exception { + SynonymMap synMap; + + // (a)->[b] + List rules = new ArrayList(); + rules.add( "a=>b" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 1, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "b" ); + + // (a)->[c] + // (b)->[c] + rules.clear(); + rules.add( "a,b=>c" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 2, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "c" ); + assertTokIncludes( synMap, "b", "c" ); + + // (a)->[b][c] + rules.clear(); + rules.add( "a=>b,c" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 1, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "b" ); + assertTokIncludes( synMap, "a", "c" ); + + // (a)->(b)->[a2] + // [a1] + rules.clear(); + rules.add( "a=>a1" ); + rules.add( "a b=>a2" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 1, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "a1" ); + assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() ); + assertTokIncludes( getSubSynonymMap( synMap, "a" ), "b", "a2" ); + + // (a)->(b)->[a2] + // (c)->[a3] + // [a1] + rules.clear(); + rules.add( "a=>a1" ); + rules.add( "a b=>a2" ); + rules.add( "a c=>a3" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 1, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "a1" ); + assertEquals( 2, getSubSynonymMap( synMap, "a" ).submap.size() ); + assertTokIncludes( getSubSynonymMap( synMap, "a" ), "b", "a2" ); + assertTokIncludes( getSubSynonymMap( synMap, "a" ), "c", "a3" ); + + // (a)->(b)->[a2] + // [a1] + // (b)->(c)->[b2] + // [b1] + rules.clear(); + rules.add( "a=>a1" ); + rules.add( "a b=>a2" ); + rules.add( "b=>b1" ); + rules.add( "b c=>b2" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 2, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "a1" ); + assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() ); + assertTokIncludes( getSubSynonymMap( synMap, "a" ), "b", "a2" ); + assertTokIncludes( synMap, "b", "b1" ); + assertEquals( 1, getSubSynonymMap( synMap, "b" ).submap.size() ); + assertTokIncludes( getSubSynonymMap( synMap, "b" ), "c", "b2" ); + } + + public void testRead1waySynonymRules() throws Exception { + SynonymMap synMap; + + // (a)->[a] + // (b)->[a] + List rules = new ArrayList(); + rules.add( "a,b" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null); + assertEquals( 2, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "a" ); + assertTokIncludes( synMap, "b", "a" ); + + // (a)->[a] + // (b)->[a] + // (c)->[a] + rules.clear(); + rules.add( "a,b,c" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null); + assertEquals( 3, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "a" ); + assertTokIncludes( synMap, "b", "a" ); + assertTokIncludes( synMap, "c", "a" ); + + // (a)->[a] + // (b1)->(b2)->[a] + rules.clear(); + rules.add( "a,b1 b2" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null); + assertEquals( 2, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "a" ); + assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() ); + assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "a" ); + + // (a1)->(a2)->[a1][a2] + // (b)->[a1][a2] + rules.clear(); + rules.add( "a1 a2,b" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null); + assertEquals( 2, synMap.submap.size() ); + assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() ); + assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" ); + assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a2" ); + assertTokIncludes( synMap, "b", "a1" ); + assertTokIncludes( synMap, "b", "a2" ); + } + + public void testRead2waySynonymRules() throws Exception { + SynonymMap synMap; + + // (a)->[a][b] + // (b)->[a][b] + List rules = new ArrayList(); + rules.add( "a,b" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 2, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "a" ); + assertTokIncludes( synMap, "a", "b" ); + assertTokIncludes( synMap, "b", "a" ); + assertTokIncludes( synMap, "b", "b" ); + + // (a)->[a][b][c] + // (b)->[a][b][c] + // (c)->[a][b][c] + rules.clear(); + rules.add( "a,b,c" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 3, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "a" ); + assertTokIncludes( synMap, "a", "b" ); + assertTokIncludes( synMap, "a", "c" ); + assertTokIncludes( synMap, "b", "a" ); + assertTokIncludes( synMap, "b", "b" ); + assertTokIncludes( synMap, "b", "c" ); + assertTokIncludes( synMap, "c", "a" ); + assertTokIncludes( synMap, "c", "b" ); + assertTokIncludes( synMap, "c", "c" ); + + // (a)->[a] + // [b1][b2] + // (b1)->(b2)->[a] + // [b1][b2] + rules.clear(); + rules.add( "a,b1 b2" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 2, synMap.submap.size() ); + assertTokIncludes( synMap, "a", "a" ); + assertTokIncludes( synMap, "a", "b1" ); + assertTokIncludes( synMap, "a", "b2" ); + assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() ); + assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "a" ); + assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "b1" ); + assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "b2" ); + + // (a1)->(a2)->[a1][a2] + // [b] + // (b)->[a1][a2] + // [b] + rules.clear(); + rules.add( "a1 a2,b" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null); + assertEquals( 2, synMap.submap.size() ); + assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() ); + assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" ); + assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a2" ); + assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "b" ); + assertTokIncludes( synMap, "b", "a1" ); + assertTokIncludes( synMap, "b", "a2" ); + assertTokIncludes( synMap, "b", "b" ); + } + + public void testBigramTokenizer() throws Exception { + SynonymMap synMap; + + // prepare bi-gram tokenizer factory + BaseTokenizerFactory tf = new NGramTokenizerFactory(); + Map args = new HashMap(); + args.put("minGramSize","2"); + args.put("maxGramSize","2"); + tf.init( args ); + + // (ab)->(bc)->(cd)->[ef][fg][gh] + List rules = new ArrayList(); + rules.add( "abcd=>efgh" ); + synMap = new SynonymMap( true ); + SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf); + assertEquals( 1, synMap.submap.size() ); + assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() ); + assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() ); + assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" ); + assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" ); + assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" ); + } + + private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception { + Token[] tokens = ((SynonymMap)map.submap.get( src )).synonyms; + boolean inc = false; + for( Token token : tokens ){ + if( exp.equals( token.termText() ) ) + inc = true; + } + assertTrue( inc ); + } + + private SynonymMap getSubSynonymMap( SynonymMap map, String src ){ + return (SynonymMap)map.submap.get( src ); + } +}