mirror of https://github.com/apache/lucene.git
SOLR-319: Changed SynonymFilterFactory to "tokenize" synonyms file.
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@657829 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f4e41c8c8f
commit
455e7a414e
|
@ -257,6 +257,15 @@ New Features
|
|||
|
||||
48. SOLR-537: Use of hl.maxAlternateFieldLength parameter from solr-ruby
|
||||
(koji)
|
||||
|
||||
49. SOLR-319: Changed SynonymFilterFactory to "tokenize" synonyms file.
|
||||
To use a tokenizer, specify "tokenizerFactory" attribute in <filter>.
|
||||
For example:
|
||||
<tokenizer class="solr.CJKTokenizerFactory"/>
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" expand="true"
|
||||
ignoreCase="true" tokenizerFactory="solr.CJKTokenizerFactory"/>
|
||||
(koji)
|
||||
|
||||
|
||||
Changes in runtime behavior
|
||||
1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
|
@ -24,8 +25,11 @@ import org.apache.solr.core.SolrCore;
|
|||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
|
@ -38,6 +42,12 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
|
|||
boolean ignoreCase = getBoolean("ignoreCase", false);
|
||||
boolean expand = getBoolean("expand", true);
|
||||
|
||||
String tf = args.get("tokenizerFactory");
|
||||
TokenizerFactory tokFactory = null;
|
||||
if( tf != null ){
|
||||
tokFactory = loadTokenizerFactory( loader, tf, args );
|
||||
}
|
||||
|
||||
if (synonyms != null) {
|
||||
List<String> wlist=null;
|
||||
try {
|
||||
|
@ -46,7 +56,7 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
|
|||
throw new RuntimeException(e);
|
||||
}
|
||||
synMap = new SynonymMap(ignoreCase);
|
||||
parseRules(wlist, synMap, "=>", ",", expand);
|
||||
parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
|
||||
if (wlist.size()<=20) {
|
||||
SolrCore.log.fine("SynonymMap "+synonyms +":"+synMap);
|
||||
}
|
||||
|
@ -55,7 +65,8 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
|
|||
|
||||
private SynonymMap synMap;
|
||||
|
||||
private static void parseRules(List<String> rules, SynonymMap map, String mappingSep, String synSep, boolean expansion) {
|
||||
static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
|
||||
String synSep, boolean expansion, TokenizerFactory tokFactory) {
|
||||
int count=0;
|
||||
for (String rule : rules) {
|
||||
// To use regexes, we need an expression that specifies an odd number of chars.
|
||||
|
@ -71,10 +82,10 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
|
|||
if (mapping.size() > 2) {
|
||||
throw new RuntimeException("Invalid Synonym Rule:" + rule);
|
||||
} else if (mapping.size()==2) {
|
||||
source = getSynList(mapping.get(0), synSep);
|
||||
target = getSynList(mapping.get(1), synSep);
|
||||
source = getSynList(mapping.get(0), synSep, tokFactory);
|
||||
target = getSynList(mapping.get(1), synSep, tokFactory);
|
||||
} else {
|
||||
source = getSynList(mapping.get(0), synSep);
|
||||
source = getSynList(mapping.get(0), synSep, tokFactory);
|
||||
if (expansion) {
|
||||
// expand to all arguments
|
||||
target = source;
|
||||
|
@ -100,21 +111,48 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso
|
|||
}
|
||||
|
||||
// a , b c , d e f => [[a],[b,c],[d,e,f]]
|
||||
private static List<List<String>> getSynList(String str, String separator) {
|
||||
private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
|
||||
List<String> strList = StrUtils.splitSmart(str, separator, false);
|
||||
// now split on whitespace to get a list of token strings
|
||||
List<List<String>> synList = new ArrayList<List<String>>();
|
||||
for (String toks : strList) {
|
||||
List<String> tokList = StrUtils.splitWS(toks, true);
|
||||
List<String> tokList = tokFactory == null ?
|
||||
StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
|
||||
synList.add(tokList);
|
||||
}
|
||||
return synList;
|
||||
}
|
||||
|
||||
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
|
||||
StringReader reader = new StringReader( source );
|
||||
TokenStream ts = loadTokenizer(tokFactory, reader);
|
||||
List<String> tokList = new ArrayList<String>();
|
||||
try {
|
||||
for( Token token = ts.next(); token != null; token = ts.next() ){
|
||||
String text = token.termText();
|
||||
if( text.length() > 0 )
|
||||
tokList.add( text );
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
finally{
|
||||
reader.close();
|
||||
}
|
||||
return tokList;
|
||||
}
|
||||
|
||||
private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
|
||||
TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
|
||||
tokFactory.init( args );
|
||||
return tokFactory;
|
||||
}
|
||||
|
||||
private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
|
||||
return tokFactory.create( reader );
|
||||
}
|
||||
|
||||
public SynonymFilter create(TokenStream input) {
|
||||
return new SynonymFilter(input,synMap);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,271 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
public class TestSynonymMap extends AnalysisTestCase {
|
||||
|
||||
public void testInvalidMappingRules() throws Exception {
|
||||
SynonymMap synMap = new SynonymMap( true );
|
||||
List<String> rules = new ArrayList<String>( 1 );
|
||||
rules.add( "a=>b=>c" );
|
||||
try{
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
fail( "RuntimeException must be thrown." );
|
||||
}
|
||||
catch( RuntimeException expected ){}
|
||||
}
|
||||
|
||||
public void testReadMappingRules() throws Exception {
|
||||
SynonymMap synMap;
|
||||
|
||||
// (a)->[b]
|
||||
List<String> rules = new ArrayList<String>();
|
||||
rules.add( "a=>b" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 1, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "b" );
|
||||
|
||||
// (a)->[c]
|
||||
// (b)->[c]
|
||||
rules.clear();
|
||||
rules.add( "a,b=>c" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 2, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "c" );
|
||||
assertTokIncludes( synMap, "b", "c" );
|
||||
|
||||
// (a)->[b][c]
|
||||
rules.clear();
|
||||
rules.add( "a=>b,c" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 1, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "b" );
|
||||
assertTokIncludes( synMap, "a", "c" );
|
||||
|
||||
// (a)->(b)->[a2]
|
||||
// [a1]
|
||||
rules.clear();
|
||||
rules.add( "a=>a1" );
|
||||
rules.add( "a b=>a2" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 1, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "a1" );
|
||||
assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "a" ), "b", "a2" );
|
||||
|
||||
// (a)->(b)->[a2]
|
||||
// (c)->[a3]
|
||||
// [a1]
|
||||
rules.clear();
|
||||
rules.add( "a=>a1" );
|
||||
rules.add( "a b=>a2" );
|
||||
rules.add( "a c=>a3" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 1, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "a1" );
|
||||
assertEquals( 2, getSubSynonymMap( synMap, "a" ).submap.size() );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "a" ), "b", "a2" );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "a" ), "c", "a3" );
|
||||
|
||||
// (a)->(b)->[a2]
|
||||
// [a1]
|
||||
// (b)->(c)->[b2]
|
||||
// [b1]
|
||||
rules.clear();
|
||||
rules.add( "a=>a1" );
|
||||
rules.add( "a b=>a2" );
|
||||
rules.add( "b=>b1" );
|
||||
rules.add( "b c=>b2" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 2, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "a1" );
|
||||
assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "a" ), "b", "a2" );
|
||||
assertTokIncludes( synMap, "b", "b1" );
|
||||
assertEquals( 1, getSubSynonymMap( synMap, "b" ).submap.size() );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "b" ), "c", "b2" );
|
||||
}
|
||||
|
||||
public void testRead1waySynonymRules() throws Exception {
|
||||
SynonymMap synMap;
|
||||
|
||||
// (a)->[a]
|
||||
// (b)->[a]
|
||||
List<String> rules = new ArrayList<String>();
|
||||
rules.add( "a,b" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
|
||||
assertEquals( 2, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "a" );
|
||||
assertTokIncludes( synMap, "b", "a" );
|
||||
|
||||
// (a)->[a]
|
||||
// (b)->[a]
|
||||
// (c)->[a]
|
||||
rules.clear();
|
||||
rules.add( "a,b,c" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
|
||||
assertEquals( 3, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "a" );
|
||||
assertTokIncludes( synMap, "b", "a" );
|
||||
assertTokIncludes( synMap, "c", "a" );
|
||||
|
||||
// (a)->[a]
|
||||
// (b1)->(b2)->[a]
|
||||
rules.clear();
|
||||
rules.add( "a,b1 b2" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
|
||||
assertEquals( 2, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "a" );
|
||||
assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "a" );
|
||||
|
||||
// (a1)->(a2)->[a1][a2]
|
||||
// (b)->[a1][a2]
|
||||
rules.clear();
|
||||
rules.add( "a1 a2,b" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
|
||||
assertEquals( 2, synMap.submap.size() );
|
||||
assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a2" );
|
||||
assertTokIncludes( synMap, "b", "a1" );
|
||||
assertTokIncludes( synMap, "b", "a2" );
|
||||
}
|
||||
|
||||
public void testRead2waySynonymRules() throws Exception {
|
||||
SynonymMap synMap;
|
||||
|
||||
// (a)->[a][b]
|
||||
// (b)->[a][b]
|
||||
List<String> rules = new ArrayList<String>();
|
||||
rules.add( "a,b" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 2, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "a" );
|
||||
assertTokIncludes( synMap, "a", "b" );
|
||||
assertTokIncludes( synMap, "b", "a" );
|
||||
assertTokIncludes( synMap, "b", "b" );
|
||||
|
||||
// (a)->[a][b][c]
|
||||
// (b)->[a][b][c]
|
||||
// (c)->[a][b][c]
|
||||
rules.clear();
|
||||
rules.add( "a,b,c" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 3, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "a" );
|
||||
assertTokIncludes( synMap, "a", "b" );
|
||||
assertTokIncludes( synMap, "a", "c" );
|
||||
assertTokIncludes( synMap, "b", "a" );
|
||||
assertTokIncludes( synMap, "b", "b" );
|
||||
assertTokIncludes( synMap, "b", "c" );
|
||||
assertTokIncludes( synMap, "c", "a" );
|
||||
assertTokIncludes( synMap, "c", "b" );
|
||||
assertTokIncludes( synMap, "c", "c" );
|
||||
|
||||
// (a)->[a]
|
||||
// [b1][b2]
|
||||
// (b1)->(b2)->[a]
|
||||
// [b1][b2]
|
||||
rules.clear();
|
||||
rules.add( "a,b1 b2" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 2, synMap.submap.size() );
|
||||
assertTokIncludes( synMap, "a", "a" );
|
||||
assertTokIncludes( synMap, "a", "b1" );
|
||||
assertTokIncludes( synMap, "a", "b2" );
|
||||
assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "a" );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "b1" );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "b1" ), "b2", "b2" );
|
||||
|
||||
// (a1)->(a2)->[a1][a2]
|
||||
// [b]
|
||||
// (b)->[a1][a2]
|
||||
// [b]
|
||||
rules.clear();
|
||||
rules.add( "a1 a2,b" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
|
||||
assertEquals( 2, synMap.submap.size() );
|
||||
assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a2" );
|
||||
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "b" );
|
||||
assertTokIncludes( synMap, "b", "a1" );
|
||||
assertTokIncludes( synMap, "b", "a2" );
|
||||
assertTokIncludes( synMap, "b", "b" );
|
||||
}
|
||||
|
||||
public void testBigramTokenizer() throws Exception {
|
||||
SynonymMap synMap;
|
||||
|
||||
// prepare bi-gram tokenizer factory
|
||||
BaseTokenizerFactory tf = new NGramTokenizerFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
args.put("minGramSize","2");
|
||||
args.put("maxGramSize","2");
|
||||
tf.init( args );
|
||||
|
||||
// (ab)->(bc)->(cd)->[ef][fg][gh]
|
||||
List<String> rules = new ArrayList<String>();
|
||||
rules.add( "abcd=>efgh" );
|
||||
synMap = new SynonymMap( true );
|
||||
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
|
||||
assertEquals( 1, synMap.submap.size() );
|
||||
assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
|
||||
assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
|
||||
assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" );
|
||||
assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" );
|
||||
assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
|
||||
}
|
||||
|
||||
private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception {
|
||||
Token[] tokens = ((SynonymMap)map.submap.get( src )).synonyms;
|
||||
boolean inc = false;
|
||||
for( Token token : tokens ){
|
||||
if( exp.equals( token.termText() ) )
|
||||
inc = true;
|
||||
}
|
||||
assertTrue( inc );
|
||||
}
|
||||
|
||||
private SynonymMap getSubSynonymMap( SynonymMap map, String src ){
|
||||
return (SynonymMap)map.submap.get( src );
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue