diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f149c45443f..ffb94a0fea0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -66,6 +66,9 @@ New Features for example if a StopFilter had removed the last token. (Mike McCandless) +* LUCENE-5219: Add support to SynonymFilterFactory for custom + parsers. (Ryan Ernst via Robert Muir) + Bug Fixes * LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java index 6b61d7f3385..7afa491b0c0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java @@ -54,17 +54,16 @@ import org.apache.lucene.util.CharsRef; * * @lucene.experimental */ -public class SolrSynonymParser extends SynonymMap.Builder { +public class SolrSynonymParser extends SynonymMap.Parser { private final boolean expand; - private final Analyzer analyzer; public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { - super(dedup); + super(dedup, analyzer); this.expand = expand; - this.analyzer = analyzer; } - - public void add(Reader in) throws IOException, ParseException { + + @Override + public void parse(Reader in) throws IOException, ParseException { LineNumberReader br = new LineNumberReader(in); try { addInternal(br); @@ -96,19 +95,19 @@ public class SolrSynonymParser extends SynonymMap.Builder { String inputStrings[] = split(sides[0], ","); inputs = new CharsRef[inputStrings.length]; for (int i = 0; i < inputs.length; i++) { - inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef()); + inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef()); } String outputStrings[] = split(sides[1], ","); outputs = new CharsRef[outputStrings.length]; for (int i = 0; i < outputs.length; i++) { - outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef()); + outputs[i] = analyze(unescape(outputStrings[i]).trim(), new CharsRef()); } } else { String inputStrings[] = split(line, ","); inputs = new CharsRef[inputStrings.length]; for (int i = 0; i < inputs.length; i++) { - inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef()); + inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef()); } if (expand) { outputs = inputs; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java index 0344db4da2e..6aa504b976a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java @@ -61,6 +61,20 @@ import org.apache.lucene.util.Version; * the same name as an init param used by the SynonymFilterFactory, the prefix * is mandatory. *

+ *
+ * <p>
+ * The optional {@code format} parameter controls how the synonyms will be parsed:
+ * It supports the short names of {@code solr} for {@link SolrSynonymParser}
+ * and {@code wordnet} for {@link WordnetSynonymParser}, or your own
+ * {@code SynonymMap.Parser} class name. The default is {@code solr}.
+ * A custom {@link SynonymMap.Parser} is expected to have a constructor taking:
+ * <ul>
+ *   <li>{@code boolean dedup} - true if duplicate rules should be ignored, false otherwise</li>
+ *   <li>{@code boolean expand} - true if conflation groups should be expanded, false if they are one-directional</li>
+ *   <li>{@code Analyzer analyzer} - an analyzer used to analyze each raw synonym</li>
+ * </ul>
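+ *
+ * <p>
+ * For example, a minimal custom parser might look like the following sketch
+ * (the class {@code MySynonymParser} and its one-pair-per-line format are
+ * hypothetical, shown only to illustrate the expected constructor and the
+ * inherited {@code analyze}/{@code add}/{@code build} calls):
+ * <pre class="prettyprint">
+ * public class MySynonymParser extends SynonymMap.Parser {
+ *   public MySynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
+ *     super(dedup, analyzer);
+ *     // 'expand' handling omitted in this sketch
+ *   }
+ *
+ *   public void parse(Reader in) throws IOException, ParseException {
+ *     BufferedReader br = new BufferedReader(in);
+ *     String line;
+ *     while ((line = br.readLine()) != null) {
+ *       // each line holds "input|output"; analyze both sides, then add the rule
+ *       String[] sides = line.split("\\|");
+ *       CharsRef input = analyze(sides[0].trim(), new CharsRef());
+ *       CharsRef output = analyze(sides[1].trim(), new CharsRef());
+ *       add(input, output, true);
+ *     }
+ *   }
+ * }
+ * </pre>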

+ * @see SolrSynonymParser SolrSynonymParser: default format */ public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { private final boolean ignoreCase; @@ -127,61 +141,44 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource } try { + String formatClass = format; if (format == null || format.equals("solr")) { - // TODO: expose dedup as a parameter? - map = loadSolrSynonyms(loader, true, analyzer); + formatClass = SolrSynonymParser.class.getName(); } else if (format.equals("wordnet")) { - map = loadWordnetSynonyms(loader, true, analyzer); - } else { - // TODO: somehow make this more pluggable - throw new IllegalArgumentException("Unrecognized synonyms format: " + format); + formatClass = WordnetSynonymParser.class.getName(); } + // TODO: expose dedup as a parameter? + map = loadSynonyms(loader, formatClass, true, analyzer); } catch (ParseException e) { throw new IOException("Error parsing synonyms file:", e); } } - + /** - * Load synonyms from the solr format, "format=solr". + * Load synonyms with the given {@link SynonymMap.Parser} class. */ - private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException { + private SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException { CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - - SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer); - File synonymFile = new File(synonyms); - if (synonymFile.exists()) { - decoder.reset(); - parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); - } else { - List files = splitFileNames(synonyms); - for (String file : files) { - decoder.reset(); - parser.add(new InputStreamReader(loader.openResource(file), decoder)); - } + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + SynonymMap.Parser parser; + Class clazz = loader.findClass(cname, SynonymMap.Parser.class); + try { + parser = clazz.getConstructor(boolean.class, boolean.class, Analyzer.class).newInstance(dedup, expand, analyzer); + } catch (Exception e) { + throw new RuntimeException(e); } - return parser.build(); - } - - /** - * Load synonyms from the wordnet format, "format=wordnet". 
- */ - private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException { - CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - - WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer); + File synonymFile = new File(synonyms); if (synonymFile.exists()) { decoder.reset(); - parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); + parser.parse(new InputStreamReader(loader.openResource(synonyms), decoder)); } else { List files = splitFileNames(synonyms); for (String file : files) { decoder.reset(); - parser.add(new InputStreamReader(loader.openResource(file), decoder)); + parser.parse(new InputStreamReader(loader.openResource(file), decoder)); } } return parser.build(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java index a30463b4762..e5b05c3c2d7 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java @@ -18,6 +18,8 @@ package org.apache.lucene.analysis.synonym; */ import java.io.IOException; +import java.io.Reader; +import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -107,39 +109,7 @@ public class SynonymMap { return reuse; } - /** Sugar: analyzes the text with the analyzer and - * separates by {@link SynonymMap#WORD_SEPARATOR}. - * reuse and its chars must not be null. */ - public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException { - TokenStream ts = analyzer.tokenStream("", text); - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); - ts.reset(); - reuse.length = 0; - while (ts.incrementToken()) { - int length = termAtt.length(); - if (length == 0) { - throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); - } - if (posIncAtt.getPositionIncrement() != 1) { - throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1"); - } - reuse.grow(reuse.length + length + 1); /* current + word + separator */ - int end = reuse.offset + reuse.length; - if (reuse.length > 0) { - reuse.chars[end++] = SynonymMap.WORD_SEPARATOR; - reuse.length++; - } - System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length); - reuse.length += length; - } - ts.end(); - ts.close(); - if (reuse.length == 0) { - throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer"); - } - return reuse; - } + /** only used for asserting! */ private boolean hasHoles(CharsRef chars) { @@ -312,4 +282,60 @@ public class SynonymMap { return new SynonymMap(fst, words, maxHorizontalContext); } } + + /** + * Abstraction for parsing synonym files. + * + * @lucene.experimental + */ + public static abstract class Parser extends Builder { + + private final Analyzer analyzer; + + public Parser(boolean dedup, Analyzer analyzer) { + super(dedup); + this.analyzer = analyzer; + } + + /** + * Parse the given input, adding synonyms to the inherited {@link Builder}. 
+ * @param in The input to parse + */ + public abstract void parse(Reader in) throws IOException, ParseException; + + /** Sugar: analyzes the text with the analyzer and + * separates by {@link SynonymMap#WORD_SEPARATOR}. + * reuse and its chars must not be null. */ + public CharsRef analyze(String text, CharsRef reuse) throws IOException { + TokenStream ts = analyzer.tokenStream("", text); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); + ts.reset(); + reuse.length = 0; + while (ts.incrementToken()) { + int length = termAtt.length(); + if (length == 0) { + throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); + } + if (posIncAtt.getPositionIncrement() != 1) { + throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1"); + } + reuse.grow(reuse.length + length + 1); /* current + word + separator */ + int end = reuse.offset + reuse.length; + if (reuse.length > 0) { + reuse.chars[end++] = SynonymMap.WORD_SEPARATOR; + reuse.length++; + } + System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length); + reuse.length += length; + } + ts.end(); + ts.close(); + if (reuse.length == 0) { + throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer"); + } + return reuse; + } + } + } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java index db7d3540e02..f4421bf5cea 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java @@ -32,17 +32,16 @@ import org.apache.lucene.util.CharsRef; * @lucene.experimental */ // TODO: allow you to specify syntactic categories (e.g. 
just nouns, etc) -public class WordnetSynonymParser extends SynonymMap.Builder { +public class WordnetSynonymParser extends SynonymMap.Parser { private final boolean expand; - private final Analyzer analyzer; public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { - super(dedup); + super(dedup, analyzer); this.expand = expand; - this.analyzer = analyzer; } - - public void add(Reader in) throws IOException, ParseException { + + @Override + public void parse(Reader in) throws IOException, ParseException { LineNumberReader br = new LineNumberReader(in); try { String line = null; @@ -89,7 +88,7 @@ public class WordnetSynonymParser extends SynonymMap.Builder { int end = line.lastIndexOf('\''); String text = line.substring(start, end).replace("''", "'"); - return analyze(analyzer, text, reuse); + return analyze(text, reuse); } private void addInternal(CharsRef synset[], int size) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java index d6fe71f18cf..197c58959cc 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java @@ -44,7 +44,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase { "this test, that testing"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random())); - parser.add(new StringReader(testFile)); + parser.parse(new StringReader(testFile)); final SynonymMap map = parser.build(); Analyzer analyzer = new Analyzer() { @@ -77,7 +77,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase { public void testInvalidDoubleMap() throws Exception { String testFile = "a => b => c"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random())); - parser.add(new StringReader(testFile)); + parser.parse(new StringReader(testFile)); } /** parse a syn file with bad syntax */ @@ -85,7 +85,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase { public void testInvalidAnalyzesToNothingOutput() throws Exception { String testFile = "a => 1"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.SIMPLE, false)); - parser.add(new StringReader(testFile)); + parser.parse(new StringReader(testFile)); } /** parse a syn file with bad syntax */ @@ -93,7 +93,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase { public void testInvalidAnalyzesToNothingInput() throws Exception { String testFile = "1 => a"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.SIMPLE, false)); - parser.add(new StringReader(testFile)); + parser.parse(new StringReader(testFile)); } /** parse a syn file with bad syntax */ @@ -101,7 +101,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase { public void testInvalidPositionsInput() throws Exception { String testFile = "testola => the test"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT)); - parser.add(new StringReader(testFile)); + parser.parse(new StringReader(testFile)); } /** parse a syn file with bad syntax */ @@ -109,7 +109,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase { public void testInvalidPositionsOutput() throws Exception { String 
testFile = "the test => testola"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT)); - parser.add(new StringReader(testFile)); + parser.parse(new StringReader(testFile)); } /** parse a syn file with some escaped syntax chars */ @@ -118,7 +118,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase { "a\\=>a => b\\=>b\n" + "a\\,a => b\\,b"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); - parser.add(new StringReader(testFile)); + parser.parse(new StringReader(testFile)); final SynonymMap map = parser.build(); Analyzer analyzer = new Analyzer() { @Override diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java index 88ed6bbe7b8..84f29d2c5ce 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java @@ -32,16 +32,33 @@ import org.apache.lucene.analysis.util.StringMockResourceLoader; import org.apache.lucene.analysis.cjk.CJKAnalyzer; public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase { - /** test that we can parse and use the solr syn file */ - public void testSynonyms() throws Exception { + + /** checks for synonyms of "GB" in synonyms.txt */ + private void checkSolrSynonyms(TokenFilterFactory factory) throws Exception { Reader reader = new StringReader("GB"); TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - stream = tokenFilterFactory("Synonym", "synonyms", "synonyms.txt").create(stream); + stream = factory.create(stream); assertTrue(stream instanceof SynonymFilter); - assertTokenStreamContents(stream, + assertTokenStreamContents(stream, new String[] { "GB", "gib", "gigabyte", "gigabytes" }, new int[] { 1, 0, 0, 0 }); } + + /** checks for synonyms of "second" in synonyms-wordnet.txt */ + private void checkWordnetSynonyms(TokenFilterFactory factory) throws Exception { + Reader reader = new StringReader("second"); + TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + stream = factory.create(stream); + assertTrue(stream instanceof SynonymFilter); + assertTokenStreamContents(stream, + new String[] { "second", "2nd", "two" }, + new int[] { 1, 0, 0 }); + } + + /** test that we can parse and use the solr syn file */ + public void testSynonyms() throws Exception { + checkSolrSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms.txt")); + } /** if the synonyms are completely empty, test that we still analyze correctly */ public void testEmptySynonyms() throws Exception { @@ -52,6 +69,14 @@ public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase { "synonyms", "synonyms.txt").create(stream); assertTokenStreamContents(stream, new String[] { "GB" }); } + + public void testFormat() throws Exception { + checkSolrSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms.txt", "format", "solr")); + checkWordnetSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms-wordnet.txt", "format", "wordnet")); + // explicit class should work the same as the "solr" alias + checkSolrSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms.txt", + "format", SolrSynonymParser.class.getName())); + } /** Test that bogus arguments result in exception */ 
public void testBogusArguments() throws Exception { @@ -133,6 +158,8 @@ public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase { // :NOOP: } } + + } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java index 89146d88640..dfe7caea500 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java @@ -624,7 +624,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase { "bbb => bbbb1 bbbb2\n"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random())); - parser.add(new StringReader(testFile)); + parser.parse(new StringReader(testFile)); final SynonymMap map = parser.build(); Analyzer analyzer = new Analyzer() { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java index eac1a678957..10488a40cee 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java @@ -27,7 +27,6 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Tokenizer; public class TestWordnetSynonymParser extends BaseTokenStreamTestCase { - Analyzer analyzer; String synonymsFile = "s(100000001,1,'woods',n,1,0).\n" + @@ -42,7 +41,7 @@ public class TestWordnetSynonymParser extends BaseTokenStreamTestCase { public void testSynonyms() throws Exception { WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random())); - parser.add(new StringReader(synonymsFile)); + parser.parse(new StringReader(synonymsFile)); final SynonymMap map = parser.build(); Analyzer analyzer = new Analyzer() { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms-wordnet.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms-wordnet.txt new file mode 100644 index 00000000000..6ecd06bdbd8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms-wordnet.txt @@ -0,0 +1,3 @@ +s(100000001,1,'second',n,1,0). +s(100000001,2,'2nd',n,1,0). +s(100000001,3,'two',n,1,0).
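
For reference, below is a standalone sketch (not part of the patch) of the renamed Parser API end to end: it builds a SynonymMap from a Solr-format rule via parse()/build() and applies it with a SynonymFilter, mirroring what SynonymFilterFactory.loadSynonyms() now does through reflection. The class name, the whitespace analyzer/tokenizer, the Version constant, and the sample input are illustrative assumptions; the sample rule matches the synonyms checked in TestSynonymFilterFactory.

import java.io.IOException;
import java.io.StringReader;
import java.text.ParseException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.Version;

public class SynonymParserUsageSketch {
  public static void main(String[] args) throws IOException, ParseException {
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_45);

    // Any SynonymMap.Parser subclass works here; SynonymFilterFactory now creates
    // one reflectively through its (boolean dedup, boolean expand, Analyzer) constructor.
    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
    parser.parse(new StringReader("GB, gib, gigabyte, gigabytes"));
    SynonymMap map = parser.build();

    // Apply the resulting map to a token stream. ignoreCase is false because the
    // map above was built without lowercasing.
    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_45, new StringReader("50 GB"));
    TokenStream stream = new SynonymFilter(tokenizer, map, false);
    // ... consume the stream as usual: reset()/incrementToken()/end()/close()
  }
}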