From 3a7b25ba92864b334526d827f3aa5aa6445d9829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Wed, 13 Nov 2019 16:40:46 +0100 Subject: [PATCH] LUCENE-9030: Fix different Solr- and WordnetSynonymParser behaviour (#981) This fixes an issue where sets of equivalent synonyms in the Wordnet format are parsed and added to the SynonymMap in a way that leads to the original input token not being typed as "word" but as SYNONYM instead. Also, the original token doesn't appear first in the token stream output, as it does for equivalent Solr-formatted synonym files. Currently the WordnetSynonymParser adds all combinations of input/output pairs of a synset entry into the synonym map, while the SolrSynonymParser excludes those where the input and output terms are the same. This change adds the same behaviour to WordnetSynonymParser and adds tests that show the two formats now output the same token order and types. --- lucene/CHANGES.txt | 3 +++ .../synonym/WordnetSynonymParser.java | 4 +++- .../synonym/TestSolrSynonymParser.java | 23 +++++++++++++++++++ .../synonym/TestWordnetSynonymParser.java | 5 ++++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index fcbbd4e6f00..d1a6f996404 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -109,6 +109,9 @@ Bug Fixes * LUCENE-9001: Fix race condition in SetOnce. (Przemko Robakowski) +* LUCENE-9030: Fix WordnetSynonymParser behaviour so it behaves similar to + SolrSynonymParser. (Christoph Buescher via Alan Woodward) + Other * LUCENE-8979: Code Cleanup: Use entryset for map iteration wherever possible. 
- Part 2 (Koen De Groote) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java index 13e488c8f2d..aa83ac48adc 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java @@ -96,7 +96,9 @@ public class WordnetSynonymParser extends SynonymMap.Parser { if (expand) { for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { - add(synset[i], synset[j], false); + if (i != j) { + add(synset[i], synset[j], true); + } } } } else { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java index 9467137e4a5..748b177a25f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java @@ -185,6 +185,29 @@ public class TestSolrSynonymParser extends BaseSynonymParserTestCase { new int[]{1, 2, 1}); } + /** Verify type of original token is "word", others are Synonym. 
*/ + public void testTypes() throws Exception { + String testFile = "woods, wood, forest"; + + Analyzer analyzer = new MockAnalyzer(random()); + SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); + parser.parse(new StringReader(testFile)); + final SynonymMap map = parser.build(); + analyzer.close(); + + analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(analyzer, "lost in the forest", + new String[]{"lost", "in", "the", "forest", "woods", "wood"}, + new String[]{"word", "word", "word", "word", "SYNONYM", "SYNONYM"}); + } + /** Test parsing of simple examples. */ public void testParseSimple() throws Exception { String testFile = diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java index 675c8291da6..2449f8fd242 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java @@ -66,6 +66,11 @@ public class TestWordnetSynonymParser extends BaseTokenStreamTestCase { /* multi words */ assertAnalyzesTo(analyzer, "king's evil", new String[] { "king's", "king's", "evil", "meany" }); + + /* all expansions, test types */ + assertAnalyzesTo(analyzer, "Lost in the forest", + new String[] { "Lost", "in", "the", "forest", "woods", "wood"}, + new String[] { "word", "word", "word", "word", "SYNONYM", "SYNONYM" }); analyzer.close(); } }