diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f149c45443f..ffb94a0fea0 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -66,6 +66,9 @@ New Features
for example if a StopFilter had removed the last token. (Mike
McCandless)
+* LUCENE-5219: Add support to SynonymFilterFactory for custom
+ parsers. (Ryan Ernst via Robert Muir)
+
Bug Fixes
* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java
index 6b61d7f3385..7afa491b0c0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java
@@ -54,17 +54,16 @@ import org.apache.lucene.util.CharsRef;
*
* @lucene.experimental
*/
-public class SolrSynonymParser extends SynonymMap.Builder {
+public class SolrSynonymParser extends SynonymMap.Parser {
private final boolean expand;
- private final Analyzer analyzer;
public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
- super(dedup);
+ super(dedup, analyzer);
this.expand = expand;
- this.analyzer = analyzer;
}
-
- public void add(Reader in) throws IOException, ParseException {
+
+ @Override
+ public void parse(Reader in) throws IOException, ParseException {
LineNumberReader br = new LineNumberReader(in);
try {
addInternal(br);
@@ -96,19 +95,19 @@ public class SolrSynonymParser extends SynonymMap.Builder {
String inputStrings[] = split(sides[0], ",");
inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
- inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
+ inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef());
}
String outputStrings[] = split(sides[1], ",");
outputs = new CharsRef[outputStrings.length];
for (int i = 0; i < outputs.length; i++) {
- outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
+ outputs[i] = analyze(unescape(outputStrings[i]).trim(), new CharsRef());
}
} else {
String inputStrings[] = split(line, ",");
inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
- inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
+ inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef());
}
if (expand) {
outputs = inputs;
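
For orientation, the call pattern after this rename (a minimal sketch, not part of the patch; the rule text and analyzer choice are illustrative, and checked-exception handling is omitted):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.synonym.SolrSynonymParser;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.util.Version;

    // dedup = true, expand = true; the analyzer is applied to each raw synonym entry
    Analyzer norm = new WhitespaceAnalyzer(Version.LUCENE_45);   // any analyzer; version constant is illustrative
    SolrSynonymParser parser = new SolrSynonymParser(true, true, norm);
    parser.parse(new StringReader("GB, gib, gigabyte, gigabytes"));  // formerly parser.add(...)
    SynonymMap map = parser.build();
    // the map is consumed exactly as before, e.g. new SynonymFilter(tokenStream, map, true)
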
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
index 0344db4da2e..6aa504b976a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
@@ -61,6 +61,20 @@ import org.apache.lucene.util.Version;
* the same name as an init param used by the SynonymFilterFactory, the prefix
* is mandatory.
*
+ *
+ * <p>
+ * The optional {@code format} parameter controls how the synonyms will be parsed:
+ * It supports the short names of {@code solr} for {@link SolrSynonymParser}
+ * and {@code wordnet} for {@link WordnetSynonymParser}, or your own
+ * {@code SynonymMap.Parser} class name. The default is {@code solr}.
+ * A custom {@link SynonymMap.Parser} is expected to have a constructor taking:
+ * <ul>
+ *   <li><code>boolean dedup</code> - true if duplicates should be ignored, false otherwise</li>
+ *   <li><code>boolean expand</code> - true if conflation groups should be expanded, false if they are one-directional</li>
+ *   <li><code>{@link Analyzer} analyzer</code> - an analyzer used for each raw synonym</li>
+ * </ul>
+ *
+ * @see SolrSynonymParser SolrSynonymParser: default format
*/
public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
private final boolean ignoreCase;
@@ -127,61 +141,44 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
}
try {
+ String formatClass = format;
if (format == null || format.equals("solr")) {
- // TODO: expose dedup as a parameter?
- map = loadSolrSynonyms(loader, true, analyzer);
+ formatClass = SolrSynonymParser.class.getName();
} else if (format.equals("wordnet")) {
- map = loadWordnetSynonyms(loader, true, analyzer);
- } else {
- // TODO: somehow make this more pluggable
- throw new IllegalArgumentException("Unrecognized synonyms format: " + format);
+ formatClass = WordnetSynonymParser.class.getName();
}
+ // TODO: expose dedup as a parameter?
+ map = loadSynonyms(loader, formatClass, true, analyzer);
} catch (ParseException e) {
throw new IOException("Error parsing synonyms file:", e);
}
}
-
+
/**
- * Load synonyms from the solr format, "format=solr".
+ * Load synonyms with the given {@link SynonymMap.Parser} class.
*/
- private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
+ private SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
- .onMalformedInput(CodingErrorAction.REPORT)
- .onUnmappableCharacter(CodingErrorAction.REPORT);
-
- SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
- File synonymFile = new File(synonyms);
- if (synonymFile.exists()) {
- decoder.reset();
- parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
- } else {
- List<String> files = splitFileNames(synonyms);
- for (String file : files) {
- decoder.reset();
- parser.add(new InputStreamReader(loader.openResource(file), decoder));
- }
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+
+ SynonymMap.Parser parser;
+ Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
+ try {
+ parser = clazz.getConstructor(boolean.class, boolean.class, Analyzer.class).newInstance(dedup, expand, analyzer);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
}
- return parser.build();
- }
-
- /**
- * Load synonyms from the wordnet format, "format=wordnet".
- */
- private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
- CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
- .onMalformedInput(CodingErrorAction.REPORT)
- .onUnmappableCharacter(CodingErrorAction.REPORT);
-
- WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer);
+
File synonymFile = new File(synonyms);
if (synonymFile.exists()) {
decoder.reset();
- parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
+ parser.parse(new InputStreamReader(loader.openResource(synonyms), decoder));
} else {
List<String> files = splitFileNames(synonyms);
for (String file : files) {
decoder.reset();
- parser.add(new InputStreamReader(loader.openResource(file), decoder));
+ parser.parse(new InputStreamReader(loader.openResource(file), decoder));
}
}
return parser.build();
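
To show what the new pluggability buys (a sketch, not part of the patch; the package, class name, and line format are hypothetical): a custom parser extends SynonymMap.Parser, exposes the documented (boolean, boolean, Analyzer) constructor that loadSynonyms() looks up reflectively via loader.findClass, and is then selected by passing its fully qualified class name as the format init arg.

    package com.example;                                   // hypothetical

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.Reader;
    import java.text.ParseException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.util.CharsRef;

    /** Treats every line as one group of space-separated, mutually equivalent terms. */
    public class SpaceSeparatedSynonymParser extends SynonymMap.Parser {
      private final boolean expand;

      // constructor shape required by SynonymFilterFactory's reflective lookup
      public SpaceSeparatedSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
        super(dedup, analyzer);
        this.expand = expand;
      }

      @Override
      public void parse(Reader in) throws IOException, ParseException {
        BufferedReader br = new BufferedReader(in);
        String line;
        while ((line = br.readLine()) != null) {
          String[] terms = line.trim().split("\\s+");
          if (terms.length < 2) continue;
          CharsRef[] refs = new CharsRef[terms.length];
          for (int i = 0; i < terms.length; i++) {
            refs[i] = analyze(terms[i], new CharsRef());   // sugar inherited from SynonymMap.Parser
          }
          for (CharsRef input : refs) {
            if (expand) {
              for (CharsRef output : refs) {
                add(input, output, false);                 // inherited from SynonymMap.Builder
              }
            } else {
              add(input, refs[0], false);                  // one-directional: map everything to the first term
            }
          }
        }
      }
    }
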
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
index a30463b4762..e5b05c3c2d7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.synonym;
*/
import java.io.IOException;
+import java.io.Reader;
+import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -107,39 +109,7 @@ public class SynonymMap {
return reuse;
}
- /** Sugar: analyzes the text with the analyzer and
- * separates by {@link SynonymMap#WORD_SEPARATOR}.
- * reuse and its chars must not be null. */
- public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
- TokenStream ts = analyzer.tokenStream("", text);
- CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
- PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
- ts.reset();
- reuse.length = 0;
- while (ts.incrementToken()) {
- int length = termAtt.length();
- if (length == 0) {
- throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
- }
- if (posIncAtt.getPositionIncrement() != 1) {
- throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
- }
- reuse.grow(reuse.length + length + 1); /* current + word + separator */
- int end = reuse.offset + reuse.length;
- if (reuse.length > 0) {
- reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
- reuse.length++;
- }
- System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
- reuse.length += length;
- }
- ts.end();
- ts.close();
- if (reuse.length == 0) {
- throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
- }
- return reuse;
- }
+
/** only used for asserting! */
private boolean hasHoles(CharsRef chars) {
@@ -312,4 +282,60 @@ public class SynonymMap {
return new SynonymMap(fst, words, maxHorizontalContext);
}
}
+
+ /**
+ * Abstraction for parsing synonym files.
+ *
+ * @lucene.experimental
+ */
+ public static abstract class Parser extends Builder {
+
+ private final Analyzer analyzer;
+
+ public Parser(boolean dedup, Analyzer analyzer) {
+ super(dedup);
+ this.analyzer = analyzer;
+ }
+
+ /**
+ * Parse the given input, adding synonyms to the inherited {@link Builder}.
+ * @param in The input to parse
+ */
+ public abstract void parse(Reader in) throws IOException, ParseException;
+
+ /** Sugar: analyzes the text with the analyzer and
+ * separates by {@link SynonymMap#WORD_SEPARATOR}.
+ * reuse and its chars must not be null. */
+ public CharsRef analyze(String text, CharsRef reuse) throws IOException {
+ TokenStream ts = analyzer.tokenStream("", text);
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+ ts.reset();
+ reuse.length = 0;
+ while (ts.incrementToken()) {
+ int length = termAtt.length();
+ if (length == 0) {
+ throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
+ }
+ if (posIncAtt.getPositionIncrement() != 1) {
+ throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
+ }
+ reuse.grow(reuse.length + length + 1); /* current + word + separator */
+ int end = reuse.offset + reuse.length;
+ if (reuse.length > 0) {
+ reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
+ reuse.length++;
+ }
+ System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
+ reuse.length += length;
+ }
+ ts.end();
+ ts.close();
+ if (reuse.length == 0) {
+ throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
+ }
+ return reuse;
+ }
+ }
+
}
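
As a quick illustration of the relocated analyze() sugar (a sketch, not from the patch; the phrases are made up and a whitespace-style analyzer is assumed): it runs the parser's analyzer over the raw text and joins the resulting tokens with SynonymMap.WORD_SEPARATOR, so the result can be handed straight to the inherited Builder.add().

    // inside a SynonymMap.Parser subclass, e.g. within parse(Reader)
    CharsRef input  = analyze("domain name service", new CharsRef());
    // -> "domain" + WORD_SEPARATOR + "name" + WORD_SEPARATOR + "service"   (WORD_SEPARATOR is U+0000)
    CharsRef output = analyze("dns", new CharsRef());
    add(input, output, false);  // includeOrig = false, matching the bundled parsers
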
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java
index db7d3540e02..f4421bf5cea 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java
@@ -32,17 +32,16 @@ import org.apache.lucene.util.CharsRef;
* @lucene.experimental
*/
// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
-public class WordnetSynonymParser extends SynonymMap.Builder {
+public class WordnetSynonymParser extends SynonymMap.Parser {
private final boolean expand;
- private final Analyzer analyzer;
public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
- super(dedup);
+ super(dedup, analyzer);
this.expand = expand;
- this.analyzer = analyzer;
}
-
- public void add(Reader in) throws IOException, ParseException {
+
+ @Override
+ public void parse(Reader in) throws IOException, ParseException {
LineNumberReader br = new LineNumberReader(in);
try {
String line = null;
@@ -89,7 +88,7 @@ public class WordnetSynonymParser extends SynonymMap.Builder {
int end = line.lastIndexOf('\'');
String text = line.substring(start, end).replace("''", "'");
- return analyze(analyzer, text, reuse);
+ return analyze(text, reuse);
}
private void addInternal(CharsRef synset[], int size) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java
index d6fe71f18cf..197c58959cc 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java
@@ -44,7 +44,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
"this test, that testing";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random()));
- parser.add(new StringReader(testFile));
+ parser.parse(new StringReader(testFile));
final SynonymMap map = parser.build();
Analyzer analyzer = new Analyzer() {
@@ -77,7 +77,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
public void testInvalidDoubleMap() throws Exception {
String testFile = "a => b => c";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random()));
- parser.add(new StringReader(testFile));
+ parser.parse(new StringReader(testFile));
}
/** parse a syn file with bad syntax */
@@ -85,7 +85,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
public void testInvalidAnalyzesToNothingOutput() throws Exception {
String testFile = "a => 1";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.SIMPLE, false));
- parser.add(new StringReader(testFile));
+ parser.parse(new StringReader(testFile));
}
/** parse a syn file with bad syntax */
@@ -93,7 +93,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
public void testInvalidAnalyzesToNothingInput() throws Exception {
String testFile = "1 => a";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.SIMPLE, false));
- parser.add(new StringReader(testFile));
+ parser.parse(new StringReader(testFile));
}
/** parse a syn file with bad syntax */
@@ -101,7 +101,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
public void testInvalidPositionsInput() throws Exception {
String testFile = "testola => the test";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
- parser.add(new StringReader(testFile));
+ parser.parse(new StringReader(testFile));
}
/** parse a syn file with bad syntax */
@@ -109,7 +109,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
public void testInvalidPositionsOutput() throws Exception {
String testFile = "the test => testola";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
- parser.add(new StringReader(testFile));
+ parser.parse(new StringReader(testFile));
}
/** parse a syn file with some escaped syntax chars */
@@ -118,7 +118,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
"a\\=>a => b\\=>b\n" +
"a\\,a => b\\,b";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
- parser.add(new StringReader(testFile));
+ parser.parse(new StringReader(testFile));
final SynonymMap map = parser.build();
Analyzer analyzer = new Analyzer() {
@Override
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java
index 88ed6bbe7b8..84f29d2c5ce 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java
@@ -32,16 +32,33 @@ import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
- /** test that we can parse and use the solr syn file */
- public void testSynonyms() throws Exception {
+
+ /** checks for synonyms of "GB" in synonyms.txt */
+ private void checkSolrSynonyms(TokenFilterFactory factory) throws Exception {
Reader reader = new StringReader("GB");
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- stream = tokenFilterFactory("Synonym", "synonyms", "synonyms.txt").create(stream);
+ stream = factory.create(stream);
assertTrue(stream instanceof SynonymFilter);
- assertTokenStreamContents(stream,
+ assertTokenStreamContents(stream,
new String[] { "GB", "gib", "gigabyte", "gigabytes" },
new int[] { 1, 0, 0, 0 });
}
+
+ /** checks for synonyms of "second" in synonyms-wordnet.txt */
+ private void checkWordnetSynonyms(TokenFilterFactory factory) throws Exception {
+ Reader reader = new StringReader("second");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = factory.create(stream);
+ assertTrue(stream instanceof SynonymFilter);
+ assertTokenStreamContents(stream,
+ new String[] { "second", "2nd", "two" },
+ new int[] { 1, 0, 0 });
+ }
+
+ /** test that we can parse and use the solr syn file */
+ public void testSynonyms() throws Exception {
+ checkSolrSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms.txt"));
+ }
/** if the synonyms are completely empty, test that we still analyze correctly */
public void testEmptySynonyms() throws Exception {
@@ -52,6 +69,14 @@ public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
"synonyms", "synonyms.txt").create(stream);
assertTokenStreamContents(stream, new String[] { "GB" });
}
+
+ public void testFormat() throws Exception {
+ checkSolrSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms.txt", "format", "solr"));
+ checkWordnetSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms-wordnet.txt", "format", "wordnet"));
+ // explicit class should work the same as the "solr" alias
+ checkSolrSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms.txt",
+ "format", SolrSynonymParser.class.getName()));
+ }
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
@@ -133,6 +158,8 @@ public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
// :NOOP:
}
}
+
+
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
index 89146d88640..dfe7caea500 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
@@ -624,7 +624,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
"bbb => bbbb1 bbbb2\n";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random()));
- parser.add(new StringReader(testFile));
+ parser.parse(new StringReader(testFile));
final SynonymMap map = parser.build();
Analyzer analyzer = new Analyzer() {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java
index eac1a678957..10488a40cee 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java
@@ -27,7 +27,6 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
- Analyzer analyzer;
String synonymsFile =
"s(100000001,1,'woods',n,1,0).\n" +
@@ -42,7 +41,7 @@ public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
public void testSynonyms() throws Exception {
WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random()));
- parser.add(new StringReader(synonymsFile));
+ parser.parse(new StringReader(synonymsFile));
final SynonymMap map = parser.build();
Analyzer analyzer = new Analyzer() {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms-wordnet.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms-wordnet.txt
new file mode 100644
index 00000000000..6ecd06bdbd8
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms-wordnet.txt
@@ -0,0 +1,3 @@
+s(100000001,1,'second',n,1,0).
+s(100000001,2,'2nd',n,1,0).
+s(100000001,3,'two',n,1,0).