LUCENE-9030: Fix different Solr- and WordnetSynonymParser behaviour (#981)

This fixes an issue where sets of equivalent synonyms in the Wordnet format were
parsed and added to the SynonymMap in a way that caused the original input
token to be typed as SYNONYM instead of "word". The original token also did
not appear first in the token stream output, unlike with equivalent
Solr-formatted synonym files.

Previously, the WordnetSynonymParser added all combinations of input/output
pairs of a synset entry to the synonym map, while the SolrSynonymParser
excludes pairs where the input and output term are the same. This change gives
WordnetSynonymParser the same behaviour and adds tests showing that the two
formats now produce the same token order and types.
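
For illustration only (not part of this commit), here is a minimal sketch of how a
Wordnet-format synset flows through the parser and what the analyzed output looks like
after this change. The class name WordnetTypesSketch, the synset ids, and the Random seed
are made up for the example; MockAnalyzer/MockTokenizer from the Lucene test framework are
assumed to be on the classpath, mirroring the tests below, and the expected output is the
one the new tests assert.

import java.io.StringReader;
import java.util.Random;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class WordnetTypesSketch {
  public static void main(String[] args) throws Exception {
    // One synset of three equivalent terms in the Wordnet prolog format:
    // s(synset_id, w_num, 'word', ss_type, sense_number, tag_count).
    String wordnet =
        "s(100000001,1,'woods',n,1,0).\n" +
        "s(100000001,2,'wood',n,1,0).\n" +
        "s(100000001,3,'forest',n,1,0).\n";

    // Parse the synonyms into a SynonymMap (dedup=true, expand=true).
    Analyzer parseTimeAnalyzer = new MockAnalyzer(new Random(42));
    WordnetSynonymParser parser = new WordnetSynonymParser(true, true, parseTimeAnalyzer);
    parser.parse(new StringReader(wordnet));
    final SynonymMap map = parser.build();
    parseTimeAnalyzer.close();

    // Wire the map into an analysis chain, like the tests in this commit do.
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    // Prints, after this change (original token first and typed "word"):
    //   lost/word in/word the/word forest/word woods/SYNONYM wood/SYNONYM
    try (TokenStream ts = analyzer.tokenStream("field", "lost in the forest")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      TypeAttribute type = ts.addAttribute(TypeAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.print(term.toString() + "/" + type.type() + " ");
      }
      ts.end();
      System.out.println();
    }
    analyzer.close();
  }
}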
Christoph Büscher authored on 2019-11-13 16:40:46 +01:00; committed by Alan Woodward
parent 0c3233877b
commit 3a7b25ba92
4 changed files with 34 additions and 1 deletion

CHANGES.txt

@@ -109,6 +109,9 @@ Bug Fixes
 * LUCENE-9001: Fix race condition in SetOnce. (Przemko Robakowski)
+* LUCENE-9030: Fix WordnetSynonymParser behaviour so it behaves similar to
+  SolrSynonymParser. (Christoph Buescher via Alan Woodward)
 Other
 * LUCENE-8979: Code Cleanup: Use entryset for map iteration wherever possible. - Part 2 (Koen De Groote)

WordnetSynonymParser.java

@@ -96,7 +96,9 @@ public class WordnetSynonymParser extends SynonymMap.Parser {
     if (expand) {
       for (int i = 0; i < size; i++) {
         for (int j = 0; j < size; j++) {
-          add(synset[i], synset[j], false);
+          if (i != j) {
+            add(synset[i], synset[j], true);
+          }
         }
       }
     } else {
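
For context (not part of the diff): the three-argument add here comes from SynonymMap.Builder,
which SynonymMap.Parser extends, and its last parameter is includeOrig, i.e. whether the
original token should be kept in the output. A standalone sketch of the expansion rule this
hunk implements, with a hypothetical helper name, could look like:

import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;

final class SynsetExpansionSketch {
  // Hypothetical helper: map every term of an equivalent-synonym set to every
  // *other* term and keep the original token, matching what SolrSynonymParser
  // does for equivalent synonyms.
  static void addEquivalentSynset(SynonymMap.Builder builder, CharsRef[] synset) {
    for (int i = 0; i < synset.length; i++) {
      for (int j = 0; j < synset.length; j++) {
        if (i != j) {
          builder.add(synset[i], synset[j], /* includeOrig= */ true);
        }
      }
    }
  }
}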

TestSolrSynonymParser.java

@@ -185,6 +185,29 @@ public class TestSolrSynonymParser extends BaseSynonymParserTestCase {
         new int[]{1, 2, 1});
   }
+  /** Verify type of original token is "word", others are Synonym. */
+  public void testTypes() throws Exception {
+    String testFile = "woods, wood, forest";
+    Analyzer analyzer = new MockAnalyzer(random());
+    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
+    parser.parse(new StringReader(testFile));
+    final SynonymMap map = parser.build();
+    analyzer.close();
+    analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+    assertAnalyzesTo(analyzer, "lost in the forest",
+        new String[]{"lost", "in", "the", "forest", "woods", "wood"},
+        new String[]{"word", "word", "word", "word", "SYNONYM", "SYNONYM"});
+  }
   /** Test parsing of simple examples. */
   public void testParseSimple() throws Exception {
     String testFile =

TestWordnetSynonymParser.java

@@ -66,6 +66,11 @@ public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
     /* multi words */
     assertAnalyzesTo(analyzer, "king's evil",
         new String[] { "king's", "king's", "evil", "meany" });
+    /* all expansions, test types */
+    assertAnalyzesTo(analyzer, "Lost in the forest",
+        new String[] { "Lost", "in", "the", "forest", "woods", "wood"},
+        new String[] { "word", "word", "word", "word", "SYNONYM", "SYNONYM" });
     analyzer.close();
   }
 }