mirror of https://github.com/apache/lucene.git
LUCENE-5219: Add support to SynonymFilterFactory for custom parsers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1524839 13f79535-47bb-0310-9956-ffa450edef68
parent b7544e16ba
commit 9d16038391
CHANGES.txt:

@@ -66,6 +66,9 @@ New Features
   for example if a StopFilter had removed the last token. (Mike
   McCandless)
 
+* LUCENE-5219: Add support to SynonymFilterFactory for custom
+  parsers. (Ryan Ernst via Robert Muir)
+
 Bug Fixes
 
 * LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
SolrSynonymParser.java:

@@ -54,17 +54,16 @@ import org.apache.lucene.util.CharsRef;
  * </ol>
  * @lucene.experimental
  */
-public class SolrSynonymParser extends SynonymMap.Builder {
+public class SolrSynonymParser extends SynonymMap.Parser {
   private final boolean expand;
-  private final Analyzer analyzer;
 
   public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
-    super(dedup);
+    super(dedup, analyzer);
     this.expand = expand;
-    this.analyzer = analyzer;
   }
 
-  public void add(Reader in) throws IOException, ParseException {
+  @Override
+  public void parse(Reader in) throws IOException, ParseException {
     LineNumberReader br = new LineNumberReader(in);
     try {
       addInternal(br);
@@ -96,19 +95,19 @@ public class SolrSynonymParser extends SynonymMap.Builder {
       String inputStrings[] = split(sides[0], ",");
       inputs = new CharsRef[inputStrings.length];
       for (int i = 0; i < inputs.length; i++) {
-        inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
+        inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef());
       }
 
       String outputStrings[] = split(sides[1], ",");
       outputs = new CharsRef[outputStrings.length];
       for (int i = 0; i < outputs.length; i++) {
-        outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
+        outputs[i] = analyze(unescape(outputStrings[i]).trim(), new CharsRef());
       }
     } else {
       String inputStrings[] = split(line, ",");
       inputs = new CharsRef[inputStrings.length];
       for (int i = 0; i < inputs.length; i++) {
-        inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
+        inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef());
       }
       if (expand) {
         outputs = inputs;
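For reference, the two branches above correspond to the two line forms of the Solr synonyms format; the entries below are illustrative, not taken from the commit:

# explicit mapping: terms on the left are rewritten to the terms on the right
i-pod, i pod => ipod
# equivalence group: with expand=true, every term maps to every other term
GB, gib, gigabyte, gigabytes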
SynonymFilterFactory.java:

@@ -61,6 +61,20 @@ import org.apache.lucene.util.Version;
  * the same name as an init param used by the SynonymFilterFactory, the prefix
  * is mandatory.
  * </p>
+ *
+ * <p>
+ * The optional {@code format} parameter controls how the synonyms will be parsed.
+ * It supports the short names {@code solr} for {@link SolrSynonymParser}
+ * and {@code wordnet} for {@link WordnetSynonymParser}, or your own
+ * {@code SynonymMap.Parser} class name. The default is {@code solr}.
+ * A custom {@link SynonymMap.Parser} is expected to have a constructor taking:
+ * <ul>
+ *   <li><code>boolean dedup</code> - true if duplicates should be ignored, false otherwise</li>
+ *   <li><code>boolean expand</code> - true if conflation groups should be expanded, false if they are one-directional</li>
+ *   <li><code>{@link Analyzer} analyzer</code> - an analyzer used for each raw synonym</li>
+ * </ul>
+ * </p>
+ * @see SolrSynonymParser SolrSynonymParser: default format
  */
 public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
   private final boolean ignoreCase;
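The constructor contract spelled out in that javadoc is what makes third-party parsers pluggable. Below is a minimal sketch of a custom parser; the class name, package, and tab-separated file format are hypothetical, while add() and analyze() are the inherited SynonymMap.Builder/Parser methods from this commit:

package com.example.analysis;                       // hypothetical package

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;

/** Parses a hypothetical "input<TAB>output" one-pair-per-line format. */
public class TabSynonymParser extends SynonymMap.Parser {
  private final boolean expand;

  // Exactly the (dedup, expand, analyzer) signature the factory reflects on.
  public TabSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup, analyzer);
    this.expand = expand;
  }

  @Override
  public void parse(Reader in) throws IOException, ParseException {
    BufferedReader br = new BufferedReader(in);
    String line;
    while ((line = br.readLine()) != null) {
      if (line.isEmpty() || line.startsWith("#")) {
        continue;                                   // skip blanks and comments
      }
      String[] sides = line.split("\t", 2);
      if (sides.length != 2) {
        throw new ParseException("expected 'input<TAB>output': " + line, 0);
      }
      // analyze() (added in this commit) normalizes each raw synonym
      CharsRef input = analyze(sides[0].trim(), new CharsRef());
      CharsRef output = analyze(sides[1].trim(), new CharsRef());
      add(input, output, false);
      if (expand) {
        add(output, input, false);                  // treat the pair as symmetric
      }
    }
  }
}

Such a class would then be selected with format="com.example.analysis.TabSynonymParser" in the factory's init args.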
@@ -127,61 +141,44 @@ public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
     }
 
     try {
+      String formatClass = format;
       if (format == null || format.equals("solr")) {
-        // TODO: expose dedup as a parameter?
-        map = loadSolrSynonyms(loader, true, analyzer);
+        formatClass = SolrSynonymParser.class.getName();
       } else if (format.equals("wordnet")) {
-        map = loadWordnetSynonyms(loader, true, analyzer);
-      } else {
-        // TODO: somehow make this more pluggable
-        throw new IllegalArgumentException("Unrecognized synonyms format: " + format);
+        formatClass = WordnetSynonymParser.class.getName();
       }
+      // TODO: expose dedup as a parameter?
+      map = loadSynonyms(loader, formatClass, true, analyzer);
     } catch (ParseException e) {
       throw new IOException("Error parsing synonyms file:", e);
     }
   }
 
   /**
-   * Load synonyms from the solr format, "format=solr".
+   * Load synonyms with the given {@link SynonymMap.Parser} class.
    */
-  private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
+  private SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
     CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
       .onMalformedInput(CodingErrorAction.REPORT)
       .onUnmappableCharacter(CodingErrorAction.REPORT);
 
-    SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
-    File synonymFile = new File(synonyms);
-    if (synonymFile.exists()) {
-      decoder.reset();
-      parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
-    } else {
-      List<String> files = splitFileNames(synonyms);
-      for (String file : files) {
-        decoder.reset();
-        parser.add(new InputStreamReader(loader.openResource(file), decoder));
-      }
-    }
-    return parser.build();
-  }
-
-  /**
-   * Load synonyms from the wordnet format, "format=wordnet".
-   */
-  private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
-    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
-      .onMalformedInput(CodingErrorAction.REPORT)
-      .onUnmappableCharacter(CodingErrorAction.REPORT);
-
-    WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer);
+    SynonymMap.Parser parser;
+    Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
+    try {
+      parser = clazz.getConstructor(boolean.class, boolean.class, Analyzer.class).newInstance(dedup, expand, analyzer);
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
     File synonymFile = new File(synonyms);
     if (synonymFile.exists()) {
       decoder.reset();
-      parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
+      parser.parse(new InputStreamReader(loader.openResource(synonyms), decoder));
     } else {
       List<String> files = splitFileNames(synonyms);
       for (String file : files) {
         decoder.reset();
-        parser.add(new InputStreamReader(loader.openResource(file), decoder));
+        parser.parse(new InputStreamReader(loader.openResource(file), decoder));
       }
     }
     return parser.build();
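Outside the factory, the same map can be built by driving the new Parser API directly. A standalone sketch; the file name and analyzer choice are illustrative, and the Version constant should match the release you build against:

import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.Version;

public class BuildSynonymMap {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_45);
    // dedup=true mirrors the factory's hard-coded choice; expand=true
    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
    parser.parse(new InputStreamReader(
        new FileInputStream("synonyms.txt"), "UTF-8"));
    SynonymMap map = parser.build();
    System.out.println("map built: " + (map.fst != null));
  }
}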
SynonymMap.java:

@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.synonym;
  */
 
 import java.io.IOException;
+import java.io.Reader;
+import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -107,39 +109,7 @@ public class SynonymMap {
       return reuse;
     }
 
-    /** Sugar: analyzes the text with the analyzer and
-     *  separates by {@link SynonymMap#WORD_SEPARATOR}.
-     *  reuse and its chars must not be null. */
-    public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
-      TokenStream ts = analyzer.tokenStream("", text);
-      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
-      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
-      ts.reset();
-      reuse.length = 0;
-      while (ts.incrementToken()) {
-        int length = termAtt.length();
-        if (length == 0) {
-          throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
-        }
-        if (posIncAtt.getPositionIncrement() != 1) {
-          throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
-        }
-        reuse.grow(reuse.length + length + 1); /* current + word + separator */
-        int end = reuse.offset + reuse.length;
-        if (reuse.length > 0) {
-          reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
-          reuse.length++;
-        }
-        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
-        reuse.length += length;
-      }
-      ts.end();
-      ts.close();
-      if (reuse.length == 0) {
-        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
-      }
-      return reuse;
-    }
-
     /** only used for asserting! */
     private boolean hasHoles(CharsRef chars) {
@@ -312,4 +282,60 @@ public class SynonymMap {
       return new SynonymMap(fst, words, maxHorizontalContext);
     }
   }
 
+  /**
+   * Abstraction for parsing synonym files.
+   *
+   * @lucene.experimental
+   */
+  public static abstract class Parser extends Builder {
+
+    private final Analyzer analyzer;
+
+    public Parser(boolean dedup, Analyzer analyzer) {
+      super(dedup);
+      this.analyzer = analyzer;
+    }
+
+    /**
+     * Parse the given input, adding synonyms to the inherited {@link Builder}.
+     * @param in The input to parse
+     */
+    public abstract void parse(Reader in) throws IOException, ParseException;
+
+    /** Sugar: analyzes the text with the analyzer and
+     *  separates by {@link SynonymMap#WORD_SEPARATOR}.
+     *  reuse and its chars must not be null. */
+    public CharsRef analyze(String text, CharsRef reuse) throws IOException {
+      TokenStream ts = analyzer.tokenStream("", text);
+      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+      ts.reset();
+      reuse.length = 0;
+      while (ts.incrementToken()) {
+        int length = termAtt.length();
+        if (length == 0) {
+          throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
+        }
+        if (posIncAtt.getPositionIncrement() != 1) {
+          throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
+        }
+        reuse.grow(reuse.length + length + 1); /* current + word + separator */
+        int end = reuse.offset + reuse.length;
+        if (reuse.length > 0) {
+          reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
+          reuse.length++;
+        }
+        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
+        reuse.length += length;
+      }
+      ts.end();
+      ts.close();
+      if (reuse.length == 0) {
+        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
+      }
+      return reuse;
+    }
+  }
+
 }
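A note on what analyze() returns: each token of the raw synonym is appended to reuse with SynonymMap.WORD_SEPARATOR (the NUL character, '\u0000') between words, so a multi-word synonym becomes one flat CharsRef. An illustrative call from inside a Parser subclass, assuming a whitespace-style analyzer:

// "big apple" -> { 'b','i','g', '\u0000', 'a','p','p','l','e' }
CharsRef ref = analyze("big apple", new CharsRef());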
WordnetSynonymParser.java:

@@ -32,17 +32,16 @@ import org.apache.lucene.util.CharsRef;
  * @lucene.experimental
  */
 // TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
-public class WordnetSynonymParser extends SynonymMap.Builder {
+public class WordnetSynonymParser extends SynonymMap.Parser {
   private final boolean expand;
-  private final Analyzer analyzer;
 
   public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
-    super(dedup);
+    super(dedup, analyzer);
     this.expand = expand;
-    this.analyzer = analyzer;
   }
 
-  public void add(Reader in) throws IOException, ParseException {
+  @Override
+  public void parse(Reader in) throws IOException, ParseException {
     LineNumberReader br = new LineNumberReader(in);
     try {
       String line = null;
@@ -89,7 +88,7 @@ public class WordnetSynonymParser extends SynonymMap.Builder {
     int end = line.lastIndexOf('\'');
 
     String text = line.substring(start, end).replace("''", "'");
-    return analyze(analyzer, text, reuse);
+    return analyze(text, reuse);
   }
 
   private void addInternal(CharsRef synset[], int size) {
TestSolrSynonymParser.java:

@@ -44,7 +44,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
       "this test, that testing";
 
     SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random()));
-    parser.add(new StringReader(testFile));
+    parser.parse(new StringReader(testFile));
     final SynonymMap map = parser.build();
 
     Analyzer analyzer = new Analyzer() {
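These parser tests all truncate at the anonymous Analyzer; the elided body conventionally wires the freshly built map into a SynonymFilter, roughly like this sketch (not the verbatim test code; it relies on the test file's imports, and ignoreCase=true is an illustrative choice):

Analyzer analyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
    // feed the map built above into a SynonymFilter behind the tokenizer
    return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
  }
};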
@@ -77,7 +77,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
   public void testInvalidDoubleMap() throws Exception {
     String testFile = "a => b => c";
     SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random()));
-    parser.add(new StringReader(testFile));
+    parser.parse(new StringReader(testFile));
   }
 
   /** parse a syn file with bad syntax */
@@ -85,7 +85,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
   public void testInvalidAnalyzesToNothingOutput() throws Exception {
     String testFile = "a => 1";
     SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.SIMPLE, false));
-    parser.add(new StringReader(testFile));
+    parser.parse(new StringReader(testFile));
   }
 
   /** parse a syn file with bad syntax */
@@ -93,7 +93,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
   public void testInvalidAnalyzesToNothingInput() throws Exception {
     String testFile = "1 => a";
     SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.SIMPLE, false));
-    parser.add(new StringReader(testFile));
+    parser.parse(new StringReader(testFile));
   }
 
   /** parse a syn file with bad syntax */
@@ -101,7 +101,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
   public void testInvalidPositionsInput() throws Exception {
     String testFile = "testola => the test";
     SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
-    parser.add(new StringReader(testFile));
+    parser.parse(new StringReader(testFile));
   }
 
   /** parse a syn file with bad syntax */
@@ -109,7 +109,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
   public void testInvalidPositionsOutput() throws Exception {
     String testFile = "the test => testola";
     SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
-    parser.add(new StringReader(testFile));
+    parser.parse(new StringReader(testFile));
   }
 
   /** parse a syn file with some escaped syntax chars */
@@ -118,7 +118,7 @@ public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
       "a\\=>a => b\\=>b\n" +
       "a\\,a => b\\,b";
     SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
-    parser.add(new StringReader(testFile));
+    parser.parse(new StringReader(testFile));
     final SynonymMap map = parser.build();
     Analyzer analyzer = new Analyzer() {
       @Override
TestSynonymFilterFactory.java:

@@ -32,17 +32,34 @@ import org.apache.lucene.analysis.util.StringMockResourceLoader;
 import org.apache.lucene.analysis.cjk.CJKAnalyzer;
 
 public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
-  /** test that we can parse and use the solr syn file */
-  public void testSynonyms() throws Exception {
+  /** checks for synonyms of "GB" in synonyms.txt */
+  private void checkSolrSynonyms(TokenFilterFactory factory) throws Exception {
     Reader reader = new StringReader("GB");
     TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-    stream = tokenFilterFactory("Synonym", "synonyms", "synonyms.txt").create(stream);
+    stream = factory.create(stream);
     assertTrue(stream instanceof SynonymFilter);
     assertTokenStreamContents(stream,
         new String[] { "GB", "gib", "gigabyte", "gigabytes" },
         new int[] { 1, 0, 0, 0 });
   }
 
+  /** checks for synonyms of "second" in synonyms-wordnet.txt */
+  private void checkWordnetSynonyms(TokenFilterFactory factory) throws Exception {
+    Reader reader = new StringReader("second");
+    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+    stream = factory.create(stream);
+    assertTrue(stream instanceof SynonymFilter);
+    assertTokenStreamContents(stream,
+        new String[] { "second", "2nd", "two" },
+        new int[] { 1, 0, 0 });
+  }
+
+  /** test that we can parse and use the solr syn file */
+  public void testSynonyms() throws Exception {
+    checkSolrSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms.txt"));
+  }
+
   /** if the synonyms are completely empty, test that we still analyze correctly */
   public void testEmptySynonyms() throws Exception {
     Reader reader = new StringReader("GB");
@@ -53,6 +70,14 @@ public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
     assertTokenStreamContents(stream, new String[] { "GB" });
   }
 
+  public void testFormat() throws Exception {
+    checkSolrSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms.txt", "format", "solr"));
+    checkWordnetSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms-wordnet.txt", "format", "wordnet"));
+    // explicit class should work the same as the "solr" alias
+    checkSolrSynonyms(tokenFilterFactory("Synonym", "synonyms", "synonyms.txt",
+        "format", SolrSynonymParser.class.getName()));
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     try {
@@ -133,6 +158,8 @@ public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
       // :NOOP:
     }
   }
+
+
 }
TestSynonymMapFilter.java:

@@ -624,7 +624,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
       "bbb => bbbb1 bbbb2\n";
 
     SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random()));
-    parser.add(new StringReader(testFile));
+    parser.parse(new StringReader(testFile));
     final SynonymMap map = parser.build();
 
     Analyzer analyzer = new Analyzer() {
TestWordnetSynonymParser.java:

@@ -27,7 +27,6 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
 
 public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
-  Analyzer analyzer;
 
   String synonymsFile =
     "s(100000001,1,'woods',n,1,0).\n" +
@@ -42,7 +41,7 @@ public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
 
   public void testSynonyms() throws Exception {
     WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random()));
-    parser.add(new StringReader(synonymsFile));
+    parser.parse(new StringReader(synonymsFile));
     final SynonymMap map = parser.build();
 
     Analyzer analyzer = new Analyzer() {
synonyms-wordnet.txt (new test file):

@@ -0,0 +1,3 @@
+s(100000001,1,'second',n,1,0).
+s(100000001,2,'2nd',n,1,0).
+s(100000001,3,'two',n,1,0).
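Each line of this new test resource is a WordNet prolog fact of the form s(synset_id, entry_number, 'word', pos, sense_number, tag_count). Entries sharing a synset id form one synset, so these three lines declare "second", "2nd", and "two" as synonyms of one another, which is exactly what checkWordnetSynonyms() asserts above.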