+ // Accum the output
+ pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+ //System.out.println(" char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
+ bufUpto += Character.charCount(codePoint);
+ }
+ // OK, entire token matched; now see if this is a final
+ // state:
+ if (scratchArc.isFinal()) {
+ matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
+ matchInputLength = tokenCount;
+ //System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput);
+ }
+ // See if the FST wants to continue matching (ie, needs to
+ // see the next input token):
+ if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
+ // No further rules can match here; we're done
+ // searching for matching rules starting at the
+ // current input position.
+ break;
+ } else {
+ // More matching is possible -- accum the output (if
+ // any) of the WORD_SEP arc:
+ pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+ if (nextRead == nextWrite) {
+ capture();
+ }
+ }
+ curNextRead = rollIncr(curNextRead);
+ }
+ if (nextRead == nextWrite && !finished) {
+ //System.out.println(" skip write slot=" + nextWrite);
+ nextWrite = rollIncr(nextWrite);
+ }
+ if (matchOutput != null) {
+ //System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
+ inputSkipCount = matchInputLength;
+ addOutput(matchOutput);
+ } else if (nextRead != nextWrite) {
+ // Even though we had no match here, we set to 1
+ // because we need to skip current input token before
+ // trying to match again:
+ inputSkipCount = 1;
+ } else {
+ assert finished;
+ }
+ //System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
+ }
+ // Interleaves all output tokens onto the futureOutputs:
+ private void addOutput(BytesRef bytes) {
+ bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
+ final int code = bytesReader.readVInt();
+ final boolean keepOrig = (code & 0x1) == 0;
+ final int count = code >>> 1;
+ //System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig);
+ for(int outputIDX=0;outputIDX ords = new ArrayList();
+ }
+ /** Sugar: just joins the provided terms with {@link
+  *  SynonymMap#WORD_SEPARATOR}.  reuse and its chars
+  *  must not be null.  The joined text is written starting
+  *  at index 0 of reuse.chars; on return reuse.offset is 0
+  *  and reuse.length is the length of the joined text.
+  *
+  *  @param words terms to join, in order
+  *  @param reuse buffer that receives the result; grown as needed
+  *  @return reuse, spanning exactly the joined terms */
+ public static CharsRef join(String[] words, CharsRef reuse) {
+   int upto = 0;
+   char[] buffer = reuse.chars;
+   for (String word : words) {
+     final int wordLen = word.length();
+     // Reserve room for the separator as well, except before the first word:
+     final int needed = upto == 0 ? wordLen : upto + 1 + wordLen;
+     if (needed > buffer.length) {
+       reuse.grow(needed);
+       buffer = reuse.chars;
+     }
+     if (upto > 0) {
+       buffer[upto++] = SynonymMap.WORD_SEPARATOR;
+     }
+     word.getChars(0, wordLen, buffer, upto);
+     upto += wordLen;
+   }
+   // Publish the written range; without this the caller sees a stale offset/length:
+   reuse.offset = 0;
+   reuse.length = upto;
+   return reuse;
+ }
+ /** Sugar: analyzes the text with the analyzer and
+  *  separates by {@link SynonymMap#WORD_SEPARATOR}.
+  *  reuse and its chars must not be null.
+  *
+  *  @param analyzer analyzer used to tokenize text
+  *  @param text text to analyze
+  *  @param reuse buffer that receives the tokens, joined by the separator
+  *  @return reuse, holding the analyzed tokens
+  *  @throws IllegalArgumentException if a token is zero-length, if a token has a
+  *          position increment other than 1, or if the analyzer produces no tokens
+  *  @throws IOException if the token stream fails */
+ public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
+   TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text));
+   try {
+     CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+     PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+     ts.reset();
+     reuse.length = 0;
+     while (ts.incrementToken()) {
+       int length = termAtt.length();
+       if (length == 0) {
+         throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
+       }
+       if (posIncAtt.getPositionIncrement() != 1) {
+         throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
+       }
+       // Writes below start at reuse.offset, so the offset must be part of the capacity request:
+       reuse.grow(reuse.offset + reuse.length + length + 1); /* offset + current + word + separator */
+       int end = reuse.offset + reuse.length;
+       if (reuse.length > 0) {
+         reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
+         reuse.length++;
+       }
+       System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
+       reuse.length += length;
+     }
+     ts.end();
+   } finally {
+     // Close even when a token is rejected above, so the stream is not left open:
+     ts.close();
+   }
+   if (reuse.length == 0) {
+     throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
+   }
+   return reuse;
+ }
+ /** only used for asserting! */
+ private boolean hasHoles(CharsRef chars) {
+ final int end = chars.offset + chars.length;
+ for(int idx=chars.offset+1;idx 0 (got " + numInputWords + ")");
+ }
+ if (input.length <= 0) {
+ throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
+ }
+ if (numOutputWords <= 0) {
+ throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
+ }
+ if (output.length <= 0) {
+ throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
+ }
+ assert !hasHoles(input): "input has holes: " + input;
+ assert !hasHoles(output): "output has holes: " + output;
+ //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
+ final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch);
+ // lookup in hash
+ int ord = words.add(utf8Scratch, hashCode);
+ if (ord < 0) {
+ // already exists in our hash
+ ord = (-ord)-1;
+ //System.out.println(" output=" + output + " old ord=" + ord);
+ } else {
+ //System.out.println(" output=" + output + " new ord=" + ord);
+ }
+ MapEntry e = workingSet.get(input);
+ if (e == null) {
+ e = new MapEntry();
+ workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map
+ }
+ e.ords.add(ord);
+ e.includeOrig |= includeOrig;
+ maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
+ maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
+ }
+ /** Returns the number of words in chars: one more than the number of
+  *  {@link SynonymMap#WORD_SEPARATOR} characters inside the referenced slice. */
+ private int countWords(CharsRef chars) {
+   final char[] text = chars.chars;
+   final int end = chars.offset + chars.length;
+   int separators = 0;
+   for (int pos = chars.offset; pos < end; pos++) {
+     if (text[pos] == SynonymMap.WORD_SEPARATOR) {
+       separators++;
+     }
+   }
+   return separators + 1;
+ }
+ /**
+  * Adds a phrase->phrase synonym mapping.
+  * A phrase is a character sequence whose words are
+  * separated by character zero (\u0000). Empty words
+  * (two \u0000s in a row) are not allowed in the input nor
+  * in the output! The word counts passed on to the
+  * five-argument add are derived from the separators.
+  *
+  * @param input input phrase
+  * @param output output phrase
+  * @param includeOrig true if the original should be included
+  */
+ public void add(CharsRef input, CharsRef output, boolean includeOrig) {
+   final int numInputWords = countWords(input);
+   final int numOutputWords = countWords(output);
+   add(input, numInputWords, output, numOutputWords, includeOrig);
+ }
+ /**
+ * Builds an {@link SynonymMap} and returns it.
+ * The inputs collected in workingSet are sorted with
+ * CharsRef.getUTF16SortedAsUTF8Comparator() and added to the FST
+ * builder in that order. The output stored for each input is a vInt
+ * header (entry count shifted left by one; low bit 0 when the original
+ * is to be included, 1 otherwise) followed by one vInt ord per entry.
+ * When dedup is enabled an ord repeated within one input is written once.
+ */
+ public SynonymMap build() throws IOException {
+ ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ // TODO: are we using the best sharing options?
+ org.apache.lucene.util.fst.Builder builder =
+ new org.apache.lucene.util.fst.Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
+ BytesRef scratch = new BytesRef(64);
+ ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
+ final Set dedupSet;
+ if (dedup) {
+ dedupSet = new HashSet();
+ } else {
+ dedupSet = null;
+ }
+ final byte[] spare = new byte[5]; // parks the header vInt while the ords are shifted right
+ Set keys = workingSet.keySet();
+ CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
+ Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
+ //System.out.println("");
+ for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
+ CharsRef input = sortedKeys[keyIdx];
+ MapEntry output = workingSet.get(input);
+ int numEntries = output.ords.size();
+ // output size, assume the worst case (5 bytes per vInt)
+ int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
+ scratch.grow(estimatedSize);
+ scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
+ assert scratch.offset == 0;
+ // now write our output data: the ords first, the header afterwards
+ int count = 0;
+ for (int i = 0; i < numEntries; i++) {
+ if (dedupSet != null) {
+ // box once
+ final Integer ent = output.ords.get(i);
+ if (dedupSet.contains(ent)) {
+ continue;
+ }
+ dedupSet.add(ent);
+ }
+ scratchOutput.writeVInt(output.ords.get(i));
+ count++;
+ }
+ final int pos = scratchOutput.getPosition();
+ scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1)); // header: count plus include-original flag in the low bit
+ final int pos2 = scratchOutput.getPosition();
+ final int vIntLen = pos2-pos;
+ // Move the count + includeOrig to the front of the byte[]:
+ System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
+ System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
+ System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);
+ if (dedupSet != null) {
+ dedupSet.clear(); // dedup applies per input, not across inputs
+ }
+ scratch.length = scratchOutput.getPosition() - scratch.offset;
+ //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
+ builder.add(input, new BytesRef(scratch));
+ }
+ FST fst = builder.finish();
+ return new SynonymMap(fst, words, maxHorizontalContext);
- List superset = currMap.synonyms==null ? replacement :
- mergeTokens(Arrays.asList(currMap.synonyms), replacement);
- currMap.synonyms = superset.toArray(new Token[superset.size()]);
- if (includeOrig) currMap.flags |= INCLUDE_ORIG;
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder("<");
- if (synonyms!=null) {
- sb.append("[");
- for (int i=0; i");
- return sb.toString();
- }
- /** Produces a List from a List */
- public static List makeTokens(List strings) {
- List ret = new ArrayList(strings.size());
- for (String str : strings) {
- //Token newTok = new Token(str,0,0,"SYNONYM");
- Token newTok = new Token(str, 0,0,"SYNONYM");
- ret.add(newTok);
- }
- return ret;
- }
- /**
- * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
- * the tokens end up at the same position.
- *
- * Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
- * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
- *
- */
- public static List mergeTokens(List lst1, List lst2) {
- ArrayList result = new ArrayList();
- if (lst1 ==null || lst2 ==null) {
- if (lst2 != null) result.addAll(lst2);
- if (lst1 != null) result.addAll(lst1);
- return result;
- }
- int pos=0;
- Iterator iter1=lst1.iterator();
- Iterator iter2=lst2.iterator();
- Token tok1 = iter1.hasNext() ? : null;
- Token tok2 = iter2.hasNext() ? : null;
- int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0;
- int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0;
- while(tok1!=null || tok2!=null) {
- while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
- Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
- tok.copyBuffer(tok1.buffer(), 0, tok1.length());
- tok.setPositionIncrement(pos1-pos);
- result.add(tok);
- pos=pos1;
- tok1 = iter1.hasNext() ? : null;
- pos1 += tok1!=null ? tok1.getPositionIncrement() : 0;
- }
- while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
- Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
- tok.copyBuffer(tok2.buffer(), 0, tok2.length());
- tok.setPositionIncrement(pos2-pos);
- result.add(tok);
- pos=pos2;
- tok2 = iter2.hasNext() ? : null;
- pos2 += tok2!=null ? tok2.getPositionIncrement() : 0;
- }
- }
- return result;
- }
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/
new file mode 100644
index 00000000000..20aeea0e362
--- /dev/null
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/
@@ -0,0 +1,112 @@
+package org.apache.lucene.analysis.synonym;
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.text.ParseException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.CharsRef;
+ * Parser for wordnet prolog format
+ *
+ * See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
+ * @lucene.experimental
+ */
+// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
+public class WordnetSynonymParser extends SynonymMap.Builder {
+ private final boolean expand;
+ private final Analyzer analyzer;
+ /**
+  * Creates a parser for synonym files in the wordnet prolog format.
+  *
+  * @param dedup passed through to the {@link SynonymMap.Builder} constructor
+  * @param expand if true, every term of a synset is mapped to every term of
+  *        that synset; if false, every term is mapped to the synset's first term
+  * @param analyzer analyzer applied to the text of each synonym
+  */
+ public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
+   super(dedup);
+   this.analyzer = analyzer;
+   this.expand = expand;
+ }
+ /**
+  * Reads synonym lines from in and adds every synset to this builder.
+  * Consecutive lines that share the synset ID found at character
+  * positions 2..10 form one synset. The reader is always closed.
+  *
+  * @param in source of the synonym lines
+  * @throws IOException if reading or analysis fails
+  * @throws ParseException if a line is malformed or its term is rejected
+  *         during analysis; the message carries the reader's line number
+  */
+ public void add(Reader in) throws IOException, ParseException {
+   LineNumberReader br = new LineNumberReader(in);
+   try {
+     String line = null;
+     String lastSynSetID = "";
+     CharsRef synset[] = new CharsRef[8];
+     int synsetSize = 0;
+     while ((line = br.readLine()) != null) {
+       if (line.length() < 11) {
+         // Too short to hold the synset ID. Report it as a bad rule (wrapped below into
+         // ParseException) rather than letting substring() throw StringIndexOutOfBoundsException:
+         throw new IllegalArgumentException("line is too short to contain a synset ID: '" + line + "'");
+       }
+       String synSetID = line.substring(2, 11);
+       if (!synSetID.equals(lastSynSetID)) {
+         // A new synset starts here: flush the previous one
+         addInternal(synset, synsetSize);
+         synsetSize = 0;
+       }
+       if (synset.length <= synsetSize+1) {
+         CharsRef larger[] = new CharsRef[synset.length * 2];
+         System.arraycopy(synset, 0, larger, 0, synsetSize);
+         synset = larger;
+       }
+       // Slots left over from earlier synsets are handed back in for reuse:
+       synset[synsetSize] = parseSynonym(line, synset[synsetSize]);
+       synsetSize++;
+       lastSynSetID = synSetID;
+     }
+     // final synset in the file
+     addInternal(synset, synsetSize);
+   } catch (IllegalArgumentException e) {
+     ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
+     ex.initCause(e);
+     throw ex;
+   } finally {
+     br.close();
+   }
+ }
+ /**
+  * Extracts the term between the first and the last single quote of line,
+  * turns each doubled quote ('') into one quote, and analyzes the result.
+  *
+  * @param line one line of the synonym file
+  * @param reuse buffer to write into, or null to allocate a new one
+  * @return the analyzed term
+  * @throws IllegalArgumentException if line has no quoted term, or if
+  *         analysis rejects the term
+  * @throws IOException if analysis fails
+  */
+ private CharsRef parseSynonym(String line, CharsRef reuse) throws IOException {
+   if (reuse == null) {
+     reuse = new CharsRef(8);
+   }
+   int start = line.indexOf('\'')+1;
+   int end = line.lastIndexOf('\'');
+   if (end < start) {
+     // Zero or one quote on the line: substring() would throw StringIndexOutOfBoundsException
+     throw new IllegalArgumentException("line does not contain a quoted term: " + line);
+   }
+   String text = line.substring(start, end).replace("''", "'");
+   return analyze(analyzer, text, reuse);
+ }
+ /**
+  * Adds the mappings for one synset made of the first size entries of synset.
+  * Nothing is added for a synset of fewer than two terms. In expand mode each
+  * term is mapped to every term of the synset (itself included); otherwise
+  * each term is mapped to the first term.
+  */
+ private void addInternal(CharsRef synset[], int size) throws IOException {
+   if (size > 1) {
+     for (int from = 0; from < size; from++) {
+       if (expand) {
+         for (int to = 0; to < size; to++) {
+           add(synset[from], synset[to], false);
+         }
+       } else {
+         add(synset[from], synset[0], false);
+       }
+     }
+   }
+ }
diff --git a/lucene/contrib/wordnet/src/java/overview.html b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/package.html
similarity index 84%
rename from lucene/contrib/wordnet/src/java/overview.html
rename to modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/package.html
index cd05399880b..2fd37e8de20 100644
--- a/lucene/contrib/wordnet/src/java/overview.html
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/package.html
@@ -1,3 +1,4 @@
- wordnet
- wordnet
\ No newline at end of file
+Analysis components for Synonyms.
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/
new file mode 100644
index 00000000000..6260a3d1618
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/
@@ -0,0 +1,144 @@
+package org.apache.lucene.analysis.synonym;
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.text.ParseException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.junit.Test;
+ * Tests parser for the Solr synonyms format
+ * @lucene.experimental
+ */
+public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
+ /** Tests some simple examples from the solr wiki: a comma-separated
+  *  equivalence list, two explicit mappings for the same input, and a
+  *  multi-word equivalence. */
+ public void testSimple() throws Exception {
+   String testFile =
+     "i-pod, ipod, ipoooood\n" +
+     "foo => foo bar\n" +
+     "foo => baz\n" +
+     "this test, that testing";
+   SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
+   parser.add(new StringReader(testFile));
+   // Build the map from the parsed rules
+   final SynonymMap map = parser.build();
+   Analyzer analyzer = new ReusableAnalyzerBase() {
+     @Override
+     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+       Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
+       return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+     }
+   };
+   assertAnalyzesTo(analyzer, "ball",
+       new String[] { "ball" },
+       new int[] { 1 });
+   assertAnalyzesTo(analyzer, "i-pod",
+       new String[] { "i-pod", "ipod", "ipoooood" },
+       new int[] { 1, 0, 0 });
+   assertAnalyzesTo(analyzer, "foo",
+       new String[] { "foo", "baz", "bar" },
+       new int[] { 1, 0, 1 });
+   assertAnalyzesTo(analyzer, "this test",
+       new String[] { "this", "that", "test", "testing" },
+       new int[] { 1, 0, 1, 0 });
+ }
+ /** A rule that chains two "=>" operators must be rejected with a ParseException. */
+ @Test(expected=ParseException.class)
+ public void testInvalidDoubleMap() throws Exception {
+   final SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
+   parser.add(new StringReader("a => b => c"));
+ }
+ /** A rule whose right-hand side is "1" must be rejected with a ParseException
+  *  (presumably because the SIMPLE mock tokenizer produces no token for it). */
+ @Test(expected=ParseException.class)
+ public void testInvalidAnalyzesToNothingOutput() throws Exception {
+   final SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
+   parser.add(new StringReader("a => 1"));
+ }
+ /** A rule whose left-hand side is "1" must be rejected with a ParseException
+  *  (presumably because the SIMPLE mock tokenizer produces no token for it). */
+ @Test(expected=ParseException.class)
+ public void testInvalidAnalyzesToNothingInput() throws Exception {
+   final SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
+   parser.add(new StringReader("1 => a"));
+ }
+ /** The rule "testola => the test", analyzed with EnglishAnalyzer, must be rejected
+  *  with a ParseException (presumably "the" is dropped, leaving a position gap). */
+ @Test(expected=ParseException.class)
+ public void testInvalidPositionsInput() throws Exception {
+   final SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
+   parser.add(new StringReader("testola => the test"));
+ }
+ /** The rule "the test => testola", analyzed with EnglishAnalyzer, must be rejected
+  *  with a ParseException (presumably "the" is dropped, leaving a position gap). */
+ @Test(expected=ParseException.class)
+ public void testInvalidPositionsOutput() throws Exception {
+   final SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
+   parser.add(new StringReader("the test => testola"));
+ }
+ /** parse a syn file with some escaped syntax chars: a backslash-escaped
+  *  "=>" or "," must end up as part of the term */
+ public void testEscapedStuff() throws Exception {
+   String testFile =
+     "a\\=>a => b\\=>b\n" +
+     "a\\,a => b\\,b";
+   SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
+   parser.add(new StringReader(testFile));
+   // Build the map from the parsed rules
+   final SynonymMap map = parser.build();
+   Analyzer analyzer = new ReusableAnalyzerBase() {
+     @Override
+     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+       Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
+       return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
+     }
+   };
+   assertAnalyzesTo(analyzer, "ball",
+       new String[] { "ball" },
+       new int[] { 1 });
+   assertAnalyzesTo(analyzer, "a=>a",
+       new String[] { "b=>b" },
+       new int[] { 1 });
+   assertAnalyzesTo(analyzer, "a,a",
+       new String[] { "b,b" },
+       new int[] { 1 });
+ }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/
new file mode 100644
index 00000000000..ba1b23f5c6b
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/
@@ -0,0 +1,393 @@
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.synonym;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util._TestUtil;
+public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
+ private SynonymMap.Builder b;
+ private Tokenizer tokensIn;
+ private SynonymFilter tokensOut;
+ private CharTermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private OffsetAttribute offsetAtt;
+ /** Adds one rule to the builder b; each run of spaces in input and output
+  *  is replaced by a single \u0000 before the phrases are handed over. */
+ private void add(String input, String output, boolean keepOrig) {
+   final CharsRef inputPhrase = new CharsRef(input.replaceAll(" +", "\u0000"));
+   final CharsRef outputPhrase = new CharsRef(output.replaceAll(" +", "\u0000"));
+   b.add(inputPhrase, outputPhrase, keepOrig);
+ }
+ private void assertEquals(CharTermAttribute term, String expected) {
+ assertEquals(expected.length(), term.length());
+ final char[] buffer = term.buffer();
+ for(int chIDX=0;chIDX 0) {
+ assertTrue(tokensOut.incrementToken());
+ if (VERBOSE) {
+ System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+ }
+ }
+ assertEquals(termAtt, expectedAtPos[atPos]);
+ assertEquals(atPos == 0 ? 1 : 0,
+ posIncrAtt.getPositionIncrement());
+ // start/end offset of all tokens at same pos should
+ // be the same:
+ assertEquals(startOffset, offsetAtt.startOffset());
+ assertEquals(endOffset, offsetAtt.endOffset());
+ }
+ }
+ tokensOut.end();
+ tokensOut.close();
+ if (VERBOSE) {
+ System.out.println(" incr: END");
+ }
+ assertEquals(expectedUpto, expected.length);
+ }
+ /** Exercises SynonymFilter over a hand-built map: single- and multi-word
+  *  inputs, keepOrig true and false, outputs longer than their input,
+  *  overlapping outputs, and the expected state-capture counts. */
+ public void testBasic() throws Exception {
+   b = new SynonymMap.Builder(true);
+   add("a", "foo", true);
+   add("a b", "bar fee", true);
+   add("b c", "dog collar", true);
+   add("c d", "dog harness holder extras", true);
+   add("m c e", "dog barks loudly", false);
+   add("e f", "foo bar", false);
+   add("e f", "baz bee", false);
+   add("z", "boo", false);
+   add("y", "bee", true);
+   tokensIn = new MockTokenizer(new StringReader("a"),
+                                MockTokenizer.WHITESPACE,
+                                true);
+   tokensIn.reset();
+   assertTrue(tokensIn.incrementToken());
+   assertFalse(tokensIn.incrementToken());
+   tokensIn.end();
+   tokensIn.close();
+   // The filter needs the built map as its second argument:
+   tokensOut = new SynonymFilter(tokensIn,
+                                 b.build(),
+                                 true);
+   termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+   posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+   offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
+   verify("a b c", "a/bar b/fee c");
+   // syn output extends beyond input tokens
+   verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");
+   verify("a b a", "a/bar b/fee a/foo");
+   // outputs that add to one another:
+   verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");
+   // two outputs for same input
+   verify("e f", "foo/baz bar/bee");
+   // mixed keepOrig true/false:
+   verify("a m c e x", "a/foo dog barks loudly x");
+   verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
+   assertTrue(tokensOut.getCaptureCount() > 0);
+   // no captureStates when no syns matched
+   verify("p q r s t", "p q r s t");
+   assertEquals(0, tokensOut.getCaptureCount());
+   // no captureStates when only single-input syns, w/ no
+   // lookahead needed, matched
+   verify("p q z y t", "p q boo y/bee t");
+   assertEquals(0, tokensOut.getCaptureCount());
+ }
+ private String getRandomString(char start, int alphabetSize, int length) {
+ assert alphabetSize <= 26;
+ char[] s = new char[2*length];
+ for(int charIDX=0;charIDX out;
+ boolean keepOrig;
+ }
+ public String slowSynMatcher(String doc, List syns, int maxOutputLength) {
+ assertTrue(doc.length() % 2 == 0);
+ final int numInputs = doc.length()/2;
+ boolean[] keepOrigs = new boolean[numInputs];
+ Arrays.fill(keepOrigs, false);
+ String[] outputs = new String[numInputs + maxOutputLength];
+ OneSyn[] matches = new OneSyn[numInputs];
+ for(OneSyn syn : syns) {
+ int idx = -1;
+ while(true) {
+ idx = doc.indexOf(, 1+idx);
+ if (idx == -1) {
+ break;
+ }
+ assertTrue(idx % 2 == 0);
+ final int matchIDX = idx/2;
+ assertTrue( % 2 == 1);
+ if (matches[matchIDX] == null) {
+ matches[matchIDX] = syn;
+ } else if ( > matches[matchIDX].in.length()) {
+ // Greedy conflict resolution: longer match wins:
+ matches[matchIDX] = syn;
+ } else {
+ assertTrue( < matches[matchIDX].in.length());
+ }
+ }
+ }
+ // Greedy conflict resolution: if syn matches a range of inputs,
+ // it prevents other syns from matching that range
+ for(int inputIDX=0;inputIDX= numInputs && outputs[inputIDX] == null) {
+ break;
+ }
+ if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
+ sb.append(inputTokens[inputIDX]);
+ posHasOutput = true;
+ }
+ if (outputs[inputIDX] != null) {
+ if (posHasOutput) {
+ sb.append('/');
+ }
+ sb.append(outputs[inputIDX]);
+ }
+ if (inputIDX < limit-1) {
+ sb.append(' ');
+ }
+ }
+ return sb.toString();
+ }
+ public void testRandom() throws Exception {
+ final int alphabetSize = _TestUtil.nextInt(random, 2, 7);
+ final int docLen = atLeast(3000);
+ //final int docLen = 50;
+ final String document = getRandomString('a', alphabetSize, docLen);
+ if (VERBOSE) {
+ System.out.println("TEST: doc=" + document);
+ }
+ final int numSyn = atLeast(5);
+ //final int numSyn = 2;
+ final Map synMap = new HashMap();
+ final List syns = new ArrayList();
+ final boolean dedup = random.nextBoolean();
+ if (VERBOSE) {
+ System.out.println(" dedup=" + dedup);
+ }
+ b = new SynonymMap.Builder(dedup);
+ for(int synIDX=0;synIDX();
+ synMap.put(synIn, s);
+ s.keepOrig = random.nextBoolean();
+ }
+ final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim();
+ s.out.add(synOut);
+ add(synIn, synOut, s.keepOrig);
+ if (VERBOSE) {
+ System.out.println(" syns[" + synIDX + "] = " + + " -> " + s.out + " keepOrig=" + s.keepOrig);
+ }
+ }
+ tokensIn = new MockTokenizer(new StringReader("a"),
+ MockTokenizer.WHITESPACE,
+ true);
+ tokensIn.reset();
+ assertTrue(tokensIn.incrementToken());
+ assertFalse(tokensIn.incrementToken());
+ tokensIn.end();
+ tokensIn.close();
+ tokensOut = new SynonymFilter(tokensIn,
+ true);
+ termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+ posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
+ if (dedup) {
+ pruneDups(syns);
+ }
+ final String expected = slowSynMatcher(document, syns, 5);
+ if (VERBOSE) {
+ System.out.println("TEST: expected=" + expected);
+ }
+ verify(document, expected);
+ }
+ /** Removes, within each rule's output list, every output string that
+  *  already occurred earlier in that same list. The first occurrence and
+  *  the relative order of the survivors are kept; lists are edited in place.
+  *
+  *  @param syns rules whose out lists are pruned */
+ private void pruneDups(List<OneSyn> syns) {
+   Set<String> seen = new HashSet<String>();
+   for (OneSyn syn : syns) {
+     int idx = 0;
+     while (idx < syn.out.size()) {
+       String out = syn.out.get(idx);
+       if (seen.add(out)) {
+         // first occurrence: keep it and move on
+         idx++;
+       } else {
+         // duplicate: drop it; the next element slides into idx
+         syn.out.remove(idx);
+       }
+     }
+     // Duplicates are only tracked per rule
+     seen.clear();
+   }
+ }
+ /** Draws random unicode strings until one is, after trimming, non-empty
+  *  and free of the \u0000 character, and returns that trimmed string. */
+ private String randomNonEmptyString() {
+   String candidate;
+   do {
+     candidate = _TestUtil.randomUnicodeString(random).trim();
+   } while (candidate.length() == 0 || candidate.indexOf('\u0000') != -1);
+   return candidate;
+ }
+ /** simple random test, doesn't verify correctness.
+  *  does verify it doesn't throw exceptions, or that the stream doesn't misbehave
+  */
+ public void testRandom2() throws Exception {
+   final int numIters = atLeast(10);
+   for (int i = 0; i < numIters; i++) {
+     b = new SynonymMap.Builder(random.nextBoolean());
+     final int numEntries = atLeast(10);
+     for (int j = 0; j < numEntries; j++) {
+       add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
+     }
+     // Build the map from the random rules added above
+     final SynonymMap map = b.build();
+     final boolean ignoreCase = random.nextBoolean();
+     final Analyzer analyzer = new ReusableAnalyzerBase() {
+       @Override
+       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+         Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+         return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
+       }
+     };
+     checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
+   }
+ }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/
new file mode 100644
index 00000000000..6f1c6329afb
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/
@@ -0,0 +1,72 @@
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.synonym;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
+ Analyzer analyzer;
+ String synonymsFile =
+ "s(100000001,1,'woods',n,1,0).\n" +
+ "s(100000001,2,'wood',n,1,0).\n" +
+ "s(100000001,3,'forest',n,1,0).\n" +
+ "s(100000002,1,'wolfish',n,1,0).\n" +
+ "s(100000002,2,'ravenous',n,1,0).\n" +
+ "s(100000003,1,'king',n,1,1).\n" +
+ "s(100000003,2,'baron',n,1,1).\n" +
+ "s(100000004,1,'king''s evil',n,1,1).\n" +
+ "s(100000004,2,'king''s meany',n,1,1).\n";
+ /** Parses synonymsFile with expand=true and checks the filter output for a
+  *  three-term synset, a two-term synset, and terms containing an escaped quote. */
+ public void testSynonyms() throws Exception {
+   WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random));
+   parser.add(new StringReader(synonymsFile));
+   // Build the map from the parsed synsets
+   final SynonymMap map = parser.build();
+   Analyzer analyzer = new ReusableAnalyzerBase() {
+     @Override
+     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+       Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+       return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
+     }
+   };
+   /* all expansions */
+   assertAnalyzesTo(analyzer, "Lost in the woods",
+       new String[] { "Lost", "in", "the", "woods", "wood", "forest" },
+       new int[] { 0, 5, 8, 12, 12, 12 },
+       new int[] { 4, 7, 11, 17, 17, 17 },
+       new int[] { 1, 1, 1, 1, 0, 0 });
+   /* single quote */
+   assertAnalyzesTo(analyzer, "king",
+       new String[] { "king", "baron" });
+   /* multi words */
+   assertAnalyzesTo(analyzer, "king's evil",
+       new String[] { "king's", "king's", "evil", "meany" });
+ }
diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/
index 48b5d251d85..317090863eb 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/
@@ -90,6 +90,10 @@ import;
* "alphabetically" in any of the documentation above indicates utf16 codepoint order,
* nothing else.
+ *
+ * NOTE: the FST file format is experimental and
+ * subject to sudden change, requiring you to rebuild the
+ * FST suggest index.
public class FSTLookup extends Lookup {
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index e0f2c21ea5a..653bcfa58e6 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -320,6 +320,9 @@ New Features
+* LUCENE-3233: Improved memory usage, build time, and performance of
+ SynonymFilterFactory. (Mike McCandless, Robert Muir)
Bug Fixes
diff --git a/solr/core/src/java/org/apache/solr/analysis/ b/solr/core/src/java/org/apache/solr/analysis/
new file mode 100644
index 00000000000..151f5a9b623
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/analysis/
@@ -0,0 +1,157 @@
+package org.apache.solr.analysis;
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.text.ParseException;
+import java.util.List;
+import java.util.Map;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
+import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.Version;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+ * @deprecated (3.4) use {@link SynonymFilterFactory} instead. this is only a backwards compatibility
+ * mechanism that will be removed in Lucene 5.0
+ */
+// NOTE: rename this to "SynonymFilterFactory" and nuke that delegator in Lucene 5.0!
+final class FSTSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ private SynonymMap map;
+ private boolean ignoreCase;
+ /**
+ * Wraps {@code input} in a {@link SynonymFilter} using the synonym map and the
+ * ignoreCase setting that {@code inform} stored on this factory.
+ */
+ @Override
+ public TokenStream create(TokenStream input) {
+ return new SynonymFilter(input, map, ignoreCase);
+ }
+ /**
+  * Reads the factory arguments and builds the synonym map.
+  *
+  * Recognized arguments: {@code ignoreCase} (default false), {@code tokenizerFactory}
+  * (optional; whitespace tokenization is used when absent) and {@code format}
+  * ({@code solr}, the default, or {@code wordnet}).
+  *
+  * @param loader used to instantiate the tokenizer factory and to open the synonym resources
+  * @throws RuntimeException if the format is unrecognized, or wrapping the IOException /
+  *         ParseException raised while loading the synonyms
+  */
+ @Override
+ public void inform(ResourceLoader loader) {
+   final boolean ignoreCase = getBoolean("ignoreCase", false);
+   this.ignoreCase = ignoreCase;
+   String tf = args.get("tokenizerFactory");
+   final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args);
+   // Analyzer handed to the parsers: tokenizes with the configured factory (or on
+   // whitespace) and lower-cases when ignoreCase is set.
+   Analyzer analyzer = new ReusableAnalyzerBase() {
+     @Override
+     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+       Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
+       TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
+       return new TokenStreamComponents(tokenizer, stream);
+     }
+   };
+   String format = args.get("format");
+   try {
+     if (format == null || format.equals("solr")) {
+       // TODO: expose dedup as a parameter?
+       map = loadSolrSynonyms(loader, true, analyzer);
+     } else if (format.equals("wordnet")) {
+       map = loadWordnetSynonyms(loader, true, analyzer);
+     } else {
+       // TODO: somehow make this more pluggable
+       throw new RuntimeException("Unrecognized synonyms format: " + format);
+     }
+   } catch (IOException e) {
+     // Only the checked exceptions are wrapped, so unchecked errors (unknown format,
+     // missing 'synonyms' argument) reach the caller with their original type and message.
+     throw new RuntimeException(e);
+   } catch (ParseException e) {
+     throw new RuntimeException(e);
+   }
+ }
+ /**
+  * Load synonyms from the solr format, "format=solr".
+  *
+  * The {@code synonyms} argument is treated as a single resource when a file with that
+  * exact name exists; otherwise it is split into several file names which are all added
+  * to the same parser. Input is decoded as UTF-8 and malformed or unmappable bytes are
+  * reported as errors instead of being silently replaced.
+  *
+  * @param loader   opens the synonym resources
+  * @param dedup    passed through to the parser
+  * @param analyzer used by the parser to tokenize the rules
+  * @return the map built from all parsed rules
+  * @throws SolrException if the required {@code synonyms} argument is missing
+  */
+ private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
+   final boolean expand = getBoolean("expand", true);
+   String synonyms = args.get("synonyms");
+   if (synonyms == null)
+     throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
+   CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
+       .onMalformedInput(CodingErrorAction.REPORT)
+       .onUnmappableCharacter(CodingErrorAction.REPORT);
+   SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
+   File synonymFile = new File(synonyms);
+   if (synonymFile.exists()) {
+     decoder.reset();
+     parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
+   } else {
+     List<String> files = StrUtils.splitFileNames(synonyms);
+     for (String file : files) {
+       // the decoder is stateful, so reset it before each new stream
+       decoder.reset();
+       parser.add(new InputStreamReader(loader.openResource(file), decoder));
+     }
+   }
+   return parser.build();
+ }
+ /**
+  * Load synonyms from the wordnet format, "format=wordnet".
+  *
+  * The {@code synonyms} argument is treated as a single resource when a file with that
+  * exact name exists; otherwise it is split into several file names which are all added
+  * to the same parser. Input is decoded as UTF-8 and malformed or unmappable bytes are
+  * reported as errors instead of being silently replaced.
+  *
+  * @param loader   opens the synonym resources
+  * @param dedup    passed through to the parser
+  * @param analyzer used by the parser to tokenize the entries
+  * @return the map built from all parsed entries
+  * @throws SolrException if the required {@code synonyms} argument is missing
+  */
+ private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
+   final boolean expand = getBoolean("expand", true);
+   String synonyms = args.get("synonyms");
+   if (synonyms == null)
+     throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
+   CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
+       .onMalformedInput(CodingErrorAction.REPORT)
+       .onUnmappableCharacter(CodingErrorAction.REPORT);
+   WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer);
+   File synonymFile = new File(synonyms);
+   if (synonymFile.exists()) {
+     decoder.reset();
+     parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
+   } else {
+     List<String> files = StrUtils.splitFileNames(synonyms);
+     for (String file : files) {
+       // the decoder is stateful, so reset it before each new stream
+       decoder.reset();
+       parser.add(new InputStreamReader(loader.openResource(file), decoder));
+     }
+   }
+   return parser.build();
+ }
+ private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){
+ TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname);
+ tokFactory.init(args);
+ return tokFactory;
+ }
diff --git a/solr/core/src/java/org/apache/solr/analysis/ b/solr/core/src/java/org/apache/solr/analysis/
new file mode 100644
index 00000000000..d97cacda7b6
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/analysis/
@@ -0,0 +1,261 @@
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedList;
+/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
+ *
+ * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
+ * or discarded. If the original tokens are included, the position increments may be modified
+ * to retain absolute positions after merging with the synonym tokenstream.
+ *
+ * Generated synonyms will start at the same position as the first matched source token.
+ * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
+ */
+final class SlowSynonymFilter extends TokenFilter {
+ private final SlowSynonymMap map; // Map
+ private Iterator replacement; // iterator over generated tokens
+ /**
+  * Creates a filter that applies the rules of {@code map} to the tokens of {@code in}.
+  *
+  * @param in  the token stream to filter
+  * @param map the synonym rules; must not be null
+  * @throws IllegalArgumentException if {@code map} is null
+  */
+ public SlowSynonymFilter(TokenStream in, SlowSynonymMap map) {
+   super(in);
+   if (map == null)
+     throw new IllegalArgumentException("map is required");
+   this.map = map;
+   // just ensuring these attributes exist...
+   addAttribute(CharTermAttribute.class);
+   addAttribute(PositionIncrementAttribute.class);
+   addAttribute(OffsetAttribute.class);
+   addAttribute(TypeAttribute.class);
+ }
+ /*
+ * Need to worry about multiple scenarios:
+ * - need to go for the longest match
+ * a b => foo #shouldn't match if "a b" is followed by "c d"
+ * a b c d => bar
+ * - need to backtrack - retry matches for tokens already read
+ * a b c d => foo
+ * b c => bar
+ * If the input stream is "a b c x", one will consume "a b c x"
+ * trying to match the first rule... all but "a" should be
+ * pushed back so a match may be made on "b c".
+ * - don't try and match generated tokens (thus need separate queue)
+ * matching is not recursive.
+ * - handle optional generation of original tokens in all these cases,
+ * merging token streams to preserve token positions.
+ * - preserve original positionIncrement of first matched token
+ */
+ // Emits the next token: first any pending generated tokens, otherwise the next input
+ // token, either passed through unchanged or used as the start of a synonym match.
+ // Returns false once nextTok() reports that no more tokens are available.
+ @Override
+ public boolean incrementToken() throws IOException {
+ while (true) {
+ // if there are any generated tokens, return them... don't try any
+ // matches against them, as we specifically don't want recursion.
+ if (replacement!=null && replacement.hasNext()) {
+ // NOTE(review): the second argument of this call is missing in this copy of the
+ // file; presumably it is the next element of 'replacement' -- restore from upstream.
+ copy(this,;
+ return true;
+ }
+ // common case fast-path of first token not matching anything
+ AttributeSource firstTok = nextTok();
+ if (firstTok == null) return false;
+ CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
+ // look up the token text in the top-level rule map; null means no rule starts with it
+ SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
+ if (result == null) {
+ copy(this, firstTok);
+ return true;
+ }
+ // fast-path failed, clone ourselves if needed
+ // (nextTok() returns 'this' for a token read live from the input, and match()
+ // below may read further tokens into the same attributes)
+ if (firstTok == this)
+ firstTok = cloneAttributes();
+ // OK, we matched a token, so find the longest match.
+ matched = new LinkedList();
+ result = match(result);
+ if (result==null) {
+ // no match, simply return the first token read.
+ copy(this, firstTok);
+ return true;
+ }
+ // reuse, or create new one each time?
+ ArrayList generated = new ArrayList(result.synonyms.length + matched.size() + 1);
+ //
+ // there was a match... let's generate the new tokens, merging
+ // in the matched tokens (position increments need adjusting)
+ //
+ AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
+ boolean includeOrig = result.includeOrig();
+ AttributeSource origTok = includeOrig ? firstTok : null;
+ PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
+ int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
+ int repPos=0; // curr position in replacement token stream
+ int pos=0; // current position in merged token stream
+ // NOTE(review): the loop below is truncated in this copy of the file; the code that
+ // fills 'generated' from result.synonyms and the matched tokens has been lost and
+ // must be restored from upstream before this compiles.
+ for (int i=0; i foo/0
+ // should I re-create the gap on the next buffered token?
+ replacement = generated.iterator();
+ // Now return to the top of the loop to read and return the first
+ // generated token.. The reason this is done is that we may have generated
+ // nothing at all, and may need to continue with more matching logic.
+ }
+ }
+ //
+ // Defer creation of the buffer until the first time it is used to
+ // optimize short fields with no matches.
+ //
+ private LinkedList buffer;
+ private LinkedList matched;
+ private boolean exhausted;
+ private AttributeSource nextTok() throws IOException {
+ if (buffer!=null && !buffer.isEmpty()) {
+ return buffer.removeFirst();
+ } else {
+ if (!exhausted && input.incrementToken()) {
+ return this;
+ } else {
+ exhausted = true;
+ return null;
+ }
+ }
+ }
+ /** Pushes {@code t} back to the front of the lookahead buffer, creating the buffer on first use. */
+ private void pushTok(AttributeSource t) {
+   if (buffer == null) {
+     buffer = new LinkedList();
+   }
+   buffer.addFirst(t);
+ }
+ /**
+  * Tries to extend the current match by one more token, recursing while the rule map has
+  * deeper entries. Tokens that are part of the returned match are added to the front of
+  * {@code matched}; a token that did not lead to a match is pushed back.
+  *
+  * @param map the rule node reached so far
+  * @return the deepest node with synonyms reachable from here, or null if there is none
+  */
+ private SlowSynonymMap match(SlowSynonymMap map) throws IOException {
+   SlowSynonymMap longer = null;
+   AttributeSource tok = (map.submap == null) ? null : nextTok();
+   if (tok != null) {
+     // a live token shares this filter's attributes, so snapshot it before reading on
+     if (tok == this) {
+       tok = cloneAttributes();
+     }
+     // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
+     CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
+     SlowSynonymMap child = map.submap.get(termAtt.buffer(), 0, termAtt.length());
+     if (child != null) {
+       longer = match(child);
+     }
+     if (longer == null) {
+       // this token did not lead to a match; make it available again
+       pushTok(tok);
+     } else {
+       matched.addFirst(tok);
+     }
+   }
+   if (longer != null) {
+     return longer;
+   }
+   // no longer sequence matched: this node is the match only if it carries synonyms
+   return map.synonyms != null ? map : null;
+ }
+ /** Copies the attribute state of {@code source} into {@code target}; does nothing when both are the same object. */
+ private void copy(AttributeSource target, AttributeSource source) {
+   if (target == source) {
+     return;
+   }
+   source.copyTo(target);
+ }
+ /**
+  * Resets the wrapped input and all per-stream state of this filter so that it can be
+  * used on a new stream.
+  */
+ @Override
+ public void reset() throws IOException {
+   input.reset();
+   replacement = null;
+   exhausted = false;
+   // Tokens pushed back or collected while matching belong to the previous stream; if that
+   // stream was not consumed to the end they would otherwise be emitted into the next one.
+   // Both lists are re-created on demand (pushTok / incrementToken).
+   buffer = null;
+   matched = null;
+ }
diff --git a/solr/core/src/java/org/apache/solr/analysis/ b/solr/core/src/java/org/apache/solr/analysis/
new file mode 100644
index 00000000000..3390d0d53c0
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/analysis/
@@ -0,0 +1,188 @@
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+ * Factory for {@link SlowSynonymFilter} (only used with luceneMatchVersion < 3.4)
+ *
+ * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
+ * expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+ * </analyzer>
+ * </fieldType>
+ * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
+ */
+final class SlowSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ /**
+  * Reads the factory arguments ({@code synonyms}, {@code ignoreCase}, {@code expand},
+  * {@code tokenizerFactory}), loads the rule lines and parses them into the synonym map
+  * held by this factory.
+  *
+  * @throws SolrException if the required {@code synonyms} argument is missing
+  */
+ public void inform(ResourceLoader loader) {
+   final String synonyms = args.get("synonyms");
+   if (synonyms == null) {
+     throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
+   }
+   final boolean ignoreCase = getBoolean("ignoreCase", false);
+   final boolean expand = getBoolean("expand", true);
+   final String tokenizerClass = args.get("tokenizerFactory");
+   // a null factory makes the rule parser split on whitespace instead
+   final TokenizerFactory tokFactory =
+       tokenizerClass == null ? null : loadTokenizerFactory(loader, tokenizerClass, args);
+   final Iterable ruleLines = loadRules(synonyms, loader);
+   synMap = new SlowSynonymMap(ignoreCase);
+   parseRules(ruleLines, synMap, "=>", ",", expand, tokFactory);
+ }
+ /**
+  * Reads the rule lines named by {@code synonyms}: as a single resource when a file with
+  * that exact name exists, otherwise as a list of file names whose lines are concatenated.
+  *
+  * @return a list of all rules
+  * @throws RuntimeException wrapping any IOException raised while reading
+  */
+ protected Iterable loadRules( String synonyms, ResourceLoader loader ) {
+   try {
+     if (new File(synonyms).exists()) {
+       return loader.getLines(synonyms);
+     }
+     List names = StrUtils.splitFileNames(synonyms);
+     List combined = new ArrayList();
+     for (String name : names) {
+       combined.addAll(loader.getLines(name.trim()));
+     }
+     return combined;
+   } catch (IOException e) {
+     throw new RuntimeException(e);
+   }
+ }
+ private SlowSynonymMap synMap;
+ /**
+  * Parses textual synonym rules and adds them to {@code map}.
+  *
+  * A rule is either {@code source mappingSep target} or a bare list. For a bare list every
+  * entry maps to all entries when {@code expansion} is true, otherwise every entry maps to
+  * the first entry only.
+  *
+  * @param rules      the rule lines
+  * @param map        receives the parsed mappings
+  * @param mappingSep separator between the source and target sides of a rule
+  * @param synSep     separator between the alternatives on one side
+  * @param expansion  how to treat rules without a mapping separator (see above)
+  * @param tokFactory tokenizer used to split an alternative into tokens, or null to split
+  *                   on whitespace
+  * @throws RuntimeException if a rule contains more than one mapping separator
+  */
+ static void parseRules(Iterable<String> rules, SlowSynonymMap map, String mappingSep,
+     String synSep, boolean expansion, TokenizerFactory tokFactory) {
+   for (String rule : rules) {
+     // To use regexes, we need an expression that specifies an odd number of chars.
+     // This can't really be done with string.split(), and since we need to
+     // do unescaping at some point anyway, we wouldn't be saving any effort
+     // by using regexes.
+     List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);
+     List<List<String>> source;
+     List<List<String>> target;
+     if (mapping.size() > 2) {
+       throw new RuntimeException("Invalid Synonym Rule:" + rule);
+     } else if (mapping.size()==2) {
+       source = getSynList(mapping.get(0), synSep, tokFactory);
+       target = getSynList(mapping.get(1), synSep, tokFactory);
+     } else {
+       source = getSynList(mapping.get(0), synSep, tokFactory);
+       if (expansion) {
+         // expand to all arguments
+         target = source;
+       } else {
+         // reduce to first argument
+         target = new ArrayList<List<String>>(1);
+         target.add(source.get(0));
+       }
+     }
+     // originals are never re-emitted by rules parsed here
+     boolean includeOrig=false;
+     for (List<String> fromToks : source) {
+       for (List<String> toToks : target) {
+         // mergeExisting=true: several rules may contribute to the same source sequence
+         map.add(fromToks,
+                 SlowSynonymMap.makeTokens(toToks),
+                 includeOrig,
+                 true
+         );
+       }
+     }
+   }
+ }
+ /**
+  * Splits one side of a rule into its alternatives and each alternative into tokens:
+  * a , b c , d e f => [[a],[b,c],[d,e,f]]
+  *
+  * @param str        one side of a synonym rule
+  * @param separator  separator between alternatives
+  * @param tokFactory tokenizer used to split an alternative, or null to split on whitespace
+  */
+ private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
+   List<String> strList = StrUtils.splitSmart(str, separator, false);
+   // now split on whitespace to get a list of token strings
+   List<List<String>> synList = new ArrayList<List<String>>();
+   for (String toks : strList) {
+     List<String> tokList = tokFactory == null ?
+         StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
+     synList.add(tokList);
+   }
+   return synList;
+ }
+ private static List splitByTokenizer(String source, TokenizerFactory tokFactory){
+ StringReader reader = new StringReader( source );
+ TokenStream ts = loadTokenizer(tokFactory, reader);
+ List tokList = new ArrayList();
+ try {
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ while (ts.incrementToken()){
+ if( termAtt.length() > 0 )
+ tokList.add( termAtt.toString() );
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ finally{
+ reader.close();
+ }
+ return tokList;
+ }
+ private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){
+ TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
+ tokFactory.init( args );
+ return tokFactory;
+ }
+ /** Creates a token stream over {@code reader} using {@code tokFactory}. */
+ private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
+ return tokFactory.create( reader );
+ }
+ /** Returns the synonym map assigned by {@code inform}; the field is not set anywhere else in the visible code. */
+ public SlowSynonymMap getSynonymMap() {
+ return synMap;
+ }
+ /** Wraps {@code input} in a {@link SlowSynonymFilter} that uses this factory's synonym map. */
+ public SlowSynonymFilter create(TokenStream input) {
+ return new SlowSynonymFilter(input,synMap);
+ }
diff --git a/solr/core/src/java/org/apache/solr/analysis/ b/solr/core/src/java/org/apache/solr/analysis/
new file mode 100644
index 00000000000..21570ae4438
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/analysis/
@@ -0,0 +1,162 @@
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.util.Version;
+import java.util.*;
+/** Mapping rules for use with {@link SlowSynonymFilter}
+ * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
+ */
+class SlowSynonymMap {
+ /** @lucene.internal */
+ public CharArrayMap submap; // recursive: token text -> SlowSynonymMap for sequences continuing with that token (filled by add())
+ /** @lucene.internal */
+ public Token[] synonyms; // replacement tokens for the sequence ending at this node; null when no rule ends here
+ // bit set of INCLUDE_ORIG / IGNORE_CASE
+ int flags;
+ static final int INCLUDE_ORIG=0x01;
+ static final int IGNORE_CASE=0x02;
+ /** Creates an empty map with no flags set. */
+ public SlowSynonymMap() {}
+ /** Creates an empty map, setting the IGNORE_CASE flag when {@code ignoreCase} is true. */
+ public SlowSynonymMap(boolean ignoreCase) {
+ if (ignoreCase) flags |= IGNORE_CASE;
+ }
+ /** True when the INCLUDE_ORIG flag is set on this node. */
+ public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; }
+ /** True when the IGNORE_CASE flag is set on this node. */
+ public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; }
+ /**
+ * Registers a mapping from a token sequence to replacement tokens, creating one nested
+ * map level per token of the sequence.
+ *
+ * @param singleMatch {@code List<String>}, the sequence of strings to match
+ * @param replacement {@code List<Token>}, the list of tokens to use on a match
+ * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
+ * @param mergeExisting merge the replacement tokens with any other mappings that exist
+ * @throws RuntimeException if a mapping for {@code singleMatch} already exists and {@code mergeExisting} is false
+ */
+ public void add(List singleMatch, List replacement, boolean includeOrig, boolean mergeExisting) {
+ SlowSynonymMap currMap = this;
+ // walk (and create as needed) one submap level per token of the sequence
+ for (String str : singleMatch) {
+ if (currMap.submap==null) {
+ // for now hardcode at 4.0, as its what the old code did.
+ // would be nice to fix, but shouldn't store a version in each submap!!!
+ currMap.submap = new CharArrayMap(Version.LUCENE_40, 1, ignoreCase());
+ }
+ SlowSynonymMap map = currMap.submap.get(str);
+ if (map==null) {
+ map = new SlowSynonymMap();
+ // new nodes inherit only the case-sensitivity flag from this map
+ map.flags |= flags & IGNORE_CASE;
+ currMap.submap.put(str, map);
+ }
+ currMap = map;
+ }
+ if (currMap.synonyms != null && !mergeExisting) {
+ throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
+ }
+ // first mapping for this sequence: take the replacement as is; otherwise merge by position
+ List superset = currMap.synonyms==null ? replacement :
+ mergeTokens(Arrays.asList(currMap.synonyms), replacement);
+ currMap.synonyms = superset.toArray(new Token[superset.size()]);
+ if (includeOrig) currMap.flags |= INCLUDE_ORIG;
+ }
+ // Debug representation of this node; the text starts with "<" and, when synonyms are
+ // present, continues with "[".
+ // NOTE(review): the for-loop line below is truncated in this copy of the file (the loop
+ // body and the statements up to the closing ">" were lost) -- restore from upstream.
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder("<");
+ if (synonyms!=null) {
+ sb.append("[");
+ for (int i=0; i");
+ return sb.toString();
+ }
+ /**
+  * Produces a {@code List<Token>} from a {@code List<String>}: one token per string, with
+  * start and end offsets 0 and type "SYNONYM".
+  */
+ public static List<Token> makeTokens(List<String> strings) {
+   List<Token> ret = new ArrayList<Token>(strings.size());
+   for (String str : strings) {
+     Token newTok = new Token(str, 0,0,"SYNONYM");
+     ret.add(newTok);
+   }
+   return ret;
+ }
+ /**
+  * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
+  * the tokens end up at the same position.
+  *
+  * Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
+  * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
+  *
+  * The input tokens are not modified; the result contains copies. If either list is null
+  * the result simply contains the elements of the other list (or is empty).
+  */
+ public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
+   ArrayList<Token> result = new ArrayList<Token>();
+   if (lst1 ==null || lst2 ==null) {
+     if (lst2 != null) result.addAll(lst2);
+     if (lst1 != null) result.addAll(lst1);
+     return result;
+   }
+   // pos is the absolute position of the last token emitted; pos1/pos2 are the absolute
+   // positions of the current token of each list.
+   int pos=0;
+   Iterator<Token> iter1=lst1.iterator();
+   Iterator<Token> iter2=lst2.iterator();
+   Token tok1 = iter1.hasNext() ? iter1.next() : null;
+   Token tok2 = iter2.hasNext() ? iter2.next() : null;
+   int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0;
+   int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0;
+   while(tok1!=null || tok2!=null) {
+     // emit tokens of list 1 while they are not positioned after list 2's current token
+     while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
+       Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
+       tok.copyBuffer(tok1.buffer(), 0, tok1.length());
+       // increment relative to the previously emitted token
+       tok.setPositionIncrement(pos1-pos);
+       result.add(tok);
+       pos=pos1;
+       tok1 = iter1.hasNext() ? iter1.next() : null;
+       pos1 += tok1!=null ? tok1.getPositionIncrement() : 0;
+     }
+     // then the same for list 2
+     while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
+       Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
+       tok.copyBuffer(tok2.buffer(), 0, tok2.length());
+       tok.setPositionIncrement(pos2-pos);
+       result.add(tok);
+       pos=pos2;
+       tok2 = iter2.hasNext() ? iter2.next() : null;
+       pos2 += tok2!=null ? tok2.getPositionIncrement() : 0;
+     }
+   }
+   return result;
+ }
diff --git a/solr/core/src/java/org/apache/solr/analysis/ b/solr/core/src/java/org/apache/solr/analysis/
index 3b8e4802b7d..d95fd1855b2 100644
--- a/solr/core/src/java/org/apache/solr/analysis/
+++ b/solr/core/src/java/org/apache/solr/analysis/
@@ -1,189 +1,54 @@
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
package org.apache.solr.analysis;
+import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
* Factory for {@link SynonymFilter}.
* <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
- * expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
+ * format="solr" ignoreCase="false" expand="true"
+ * tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
* </analyzer>
* </fieldType>
- *
public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ private BaseTokenFilterFactory delegator;
+ /**
+  * Chooses the implementation to delegate to and initializes it with the same arguments:
+  * the FST-based factory when luceneMatchVersion is 3.4 or later, otherwise the old
+  * (slow) factory.
+  *
+  * @throws IllegalArgumentException if a format other than "solr" is requested together
+  *         with a luceneMatchVersion before 3.4
+  */
+ @Override
+ public void init(Map args) {
+   super.init(args);
+   assureMatchVersion();
+   if (luceneMatchVersion.onOrAfter(Version.LUCENE_34)) {
+     delegator = new FSTSynonymFilterFactory();
+   } else {
+     // check if you use the new optional arg "format". this makes no sense for the old one,
+     // as its wired to solr's synonyms format only.
+     // A missing or null value means "not specified"; comparing from the constant side
+     // avoids a NullPointerException for a key that maps to null.
+     final Object format = args.get("format");
+     if (format != null && !"solr".equals(format)) {
+       throw new IllegalArgumentException("You must specify luceneMatchVersion >= 3.4 to use alternate synonyms formats");
+     }
+     delegator = new SlowSynonymFilterFactory();
+   }
+   delegator.init(args);
+ }
+ /**
+ * Delegates to the factory selected in {@code init}. The null check is an assertion only,
+ * so it is active only when assertions are enabled.
+ */
+ @Override
+ public TokenStream create(TokenStream input) {
+ assert delegator != null : "init() was not called!";
+ return delegator.create(input);
+ }
+ @Override
public void inform(ResourceLoader loader) {
- String synonyms = args.get("synonyms");
- if (synonyms == null)
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
- boolean ignoreCase = getBoolean("ignoreCase", false);
- boolean expand = getBoolean("expand", true);
- String tf = args.get("tokenizerFactory");
- TokenizerFactory tokFactory = null;
- if( tf != null ){
- tokFactory = loadTokenizerFactory( loader, tf, args );
- }
- Iterable wlist=loadRules( synonyms, loader );
- synMap = new SynonymMap(ignoreCase);
- parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
- }
- /**
- * @return a list of all rules
- */
- protected Iterable loadRules( String synonyms, ResourceLoader loader ) {
- List wlist=null;
- try {
- File synonymFile = new File(synonyms);
- if (synonymFile.exists()) {
- wlist = loader.getLines(synonyms);
- } else {
- List files = StrUtils.splitFileNames(synonyms);
- wlist = new ArrayList();
- for (String file : files) {
- List lines = loader.getLines(file.trim());
- wlist.addAll(lines);
- }
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- return wlist;
- }
- private SynonymMap synMap;
- static void parseRules(Iterable rules, SynonymMap map, String mappingSep,
- String synSep, boolean expansion, TokenizerFactory tokFactory) {
- int count=0;
- for (String rule : rules) {
- // To use regexes, we need an expression that specifies an odd number of chars.
- // This can't really be done with string.split(), and since we need to
- // do unescaping at some point anyway, we wouldn't be saving any effort
- // by using regexes.
- List mapping = StrUtils.splitSmart(rule, mappingSep, false);
- List> source;
- List> target;
- if (mapping.size() > 2) {
- throw new RuntimeException("Invalid Synonym Rule:" + rule);
- } else if (mapping.size()==2) {
- source = getSynList(mapping.get(0), synSep, tokFactory);
- target = getSynList(mapping.get(1), synSep, tokFactory);
- } else {
- source = getSynList(mapping.get(0), synSep, tokFactory);
- if (expansion) {
- // expand to all arguments
- target = source;
- } else {
- // reduce to first argument
- target = new ArrayList>(1);
- target.add(source.get(0));
- }
- }
- boolean includeOrig=false;
- for (List fromToks : source) {
- count++;
- for (List toToks : target) {
- map.add(fromToks,
- SynonymMap.makeTokens(toToks),
- includeOrig,
- true
- );
- }
- }
- }
- }
- // a , b c , d e f => [[a],[b,c],[d,e,f]]
- private static List> getSynList(String str, String separator, TokenizerFactory tokFactory) {
- List strList = StrUtils.splitSmart(str, separator, false);
- // now split on whitespace to get a list of token strings
- List> synList = new ArrayList>();
- for (String toks : strList) {
- List tokList = tokFactory == null ?
- StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
- synList.add(tokList);
- }
- return synList;
- }
- private static List splitByTokenizer(String source, TokenizerFactory tokFactory){
- StringReader reader = new StringReader( source );
- TokenStream ts = loadTokenizer(tokFactory, reader);
- List tokList = new ArrayList();
- try {
- CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
- while (ts.incrementToken()){
- if( termAtt.length() > 0 )
- tokList.add( termAtt.toString() );
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- finally{
- reader.close();
- }
- return tokList;
- }
- private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){
- TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
- tokFactory.init( args );
- return tokFactory;
- }
- private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
- return tokFactory.create( reader );
- }
- public SynonymMap getSynonymMap() {
- return synMap;
- }
- public SynonymFilter create(TokenStream input) {
- return new SynonymFilter(input,synMap);
+ assert delegator != null : "init() was not called!";
+ ((ResourceLoaderAware) delegator).inform(loader);
diff --git a/solr/core/src/test/org/apache/solr/analysis/ b/solr/core/src/test/org/apache/solr/analysis/
index f0dd0782567..6afda9bed98 100644
--- a/solr/core/src/test/org/apache/solr/analysis/
+++ b/solr/core/src/test/org/apache/solr/analysis/
@@ -17,30 +17,69 @@
package org.apache.solr.analysis;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.synonym.SynonymFilter;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.junit.Test;
+import org.apache.solr.common.ResourceLoader;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
* @since solr 1.4
public class TestMultiWordSynonyms extends BaseTokenTestCase {
- @Test
- public void testMultiWordSynonyms() throws IOException {
+ /**
+ * @deprecated Remove this test in 5.0
+ */
+ @Deprecated
+ public void testMultiWordSynonymsOld() throws IOException {
List rules = new ArrayList();
rules.add("a b c,d");
- SynonymMap synMap = new SynonymMap(true);
- SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
+ SlowSynonymMap synMap = new SlowSynonymMap(true);
+ SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
- SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
+ SlowSynonymFilter ts = new SlowSynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
// This fails because ["e","e"] is the value of the token stream
assertTokenStreamContents(ts, new String[] { "a", "e" });
+ public void testMultiWordSynonyms() throws IOException { // same scenario as the legacy test, but driven through SynonymFilterFactory
+ SynonymFilterFactory factory = new SynonymFilterFactory();
+ Map args = new HashMap();
+ args.put("synonyms", "synonyms.txt"); // resource name only; the mock loader below ignores it and always serves its fixed text
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader("a b c,d")); // single rule: multi-word "a b c" grouped with "d"
+ TokenStream ts = factory.create(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false)); // input "a e" only partially overlaps the "a b c" rule
+ // This fails because ["e","e"] is the value of the token stream
+ // NOTE(review): the line above is copied verbatim from the legacy test; confirm it still describes the new filter
+ assertTokenStreamContents(ts, new String[] { "a", "e" }); // expects the input tokens to pass through unchanged
+ }
+ private class StringMockSolrResourceLoader implements ResourceLoader { // test stub: serves one fixed string as the content of every resource
+ String text; // content returned by openResource, set once in the constructor
+ StringMockSolrResourceLoader(String text) {
+ this.text = text;
+ }
+ public List getLines(String resource) throws IOException {
+ return null; // stub: always null, whatever resource is requested
+ }
+ public Object newInstance(String cname, String... subpackages) {
+ return null; // stub: never instantiates anything
+ }
+ public InputStream openResource(String resource) throws IOException {
+ return new ByteArrayInputStream(text.getBytes("UTF-8")); // resource name is ignored; a fresh stream over the UTF-8 bytes of text is returned on each call
+ }
+ }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/ b/solr/core/src/test/org/apache/solr/analysis/
similarity index 92%
rename from modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/
rename to solr/core/src/test/org/apache/solr/analysis/
index 82c2e1ce6ae..740ad33b17f 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/
+++ b/solr/core/src/test/org/apache/solr/analysis/
@@ -15,7 +15,7 @@
* limitations under the License.
-package org.apache.lucene.analysis.synonym;
+package org.apache.solr.analysis;
@@ -29,51 +29,52 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
+ * @deprecated Remove this test in Lucene 5.0
-public class TestSynonymFilter extends BaseTokenStreamTestCase {
+public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
static List strings(String str) {
String[] arr = str.split(" ");
return Arrays.asList(arr);
- static void assertTokenizesTo(SynonymMap dict, String input,
+ static void assertTokenizesTo(SlowSynonymMap dict, String input,
String expected[]) throws IOException {
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
- SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+ SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected);
- static void assertTokenizesTo(SynonymMap dict, String input,
+ static void assertTokenizesTo(SlowSynonymMap dict, String input,
String expected[], int posIncs[]) throws IOException {
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
- SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+ SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
- static void assertTokenizesTo(SynonymMap dict, List input,
+ static void assertTokenizesTo(SlowSynonymMap dict, List input,
String expected[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
- SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+ SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
- static void assertTokenizesTo(SynonymMap dict, List input,
+ static void assertTokenizesTo(SlowSynonymMap dict, List input,
String expected[], int startOffsets[], int endOffsets[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
- SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+ SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
public void testMatching() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@@ -110,7 +111,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testIncludeOrig() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = true;
boolean merge = true;
@@ -167,7 +168,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testMapMerge() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@@ -206,7 +207,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testOverlap() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@@ -229,7 +230,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testPositionIncrements() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@@ -264,7 +265,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testPositionIncrementsWithOrig() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = true;
boolean merge = true;
@@ -304,7 +305,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
// x=>y
// analysing "a x" causes "y" to have a bad offset (end less than start)
// SOLR-167
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
diff --git a/solr/core/src/test/org/apache/solr/analysis/ b/solr/core/src/test/org/apache/solr/analysis/
new file mode 100644
index 00000000000..24b4ef505a9
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/analysis/
@@ -0,0 +1,62 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.util.Version;
+import org.apache.solr.core.SolrResourceLoader;
+public class TestSynonymFilterFactory extends BaseTokenTestCase {
+ /** test that we can parse and use the solr syn file */
+ public void testSynonyms() throws Exception {
+ SynonymFilterFactory factory = new SynonymFilterFactory();
+ Map args = new HashMap();
+ args.put("synonyms", "synonyms.txt"); // no luceneMatchVersion is set here, unlike testSynonymsOld
+ factory.init(args);
+ factory.inform(new SolrResourceLoader(null, null)); // presumably resolves synonyms.txt from the default test config; confirm against SolrResourceLoader
+ TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
+ assertTrue(ts instanceof SynonymFilter); // the factory must hand back the new Lucene SynonymFilter here
+ assertTokenStreamContents(ts,
+ new String[] { "GB", "gib", "gigabyte", "gigabytes" }, // original token followed by its expansions
+ new int[] { 1, 0, 0, 0 }); // presumably position increments: expansions stacked on the position of "GB"; confirm overload
+ }
+ /** test that we can parse and use the solr syn file, with the old impl
+ * @deprecated Remove this test in Lucene 5.0 */
+ @Deprecated
+ public void testSynonymsOld() throws Exception {
+ SynonymFilterFactory factory = new SynonymFilterFactory();
+ Map args = new HashMap();
+ args.put("luceneMatchVersion", Version.LUCENE_33.toString()); // only difference from testSynonyms: pin the match version to LUCENE_33
+ args.put("synonyms", "synonyms.txt");
+ factory.init(args);
+ factory.inform(new SolrResourceLoader(null, null)); // presumably resolves synonyms.txt from the default test config; confirm against SolrResourceLoader
+ TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
+ assertTrue(ts instanceof SlowSynonymFilter); // with the LUCENE_33 match version the factory is expected to return the legacy filter
+ assertTokenStreamContents(ts,
+ new String[] { "GB", "gib", "gigabyte", "gigabytes" }, // same expected tokens as the new implementation
+ new int[] { 1, 0, 0, 0 }); // presumably position increments: expansions stacked on the position of "GB"; confirm overload
+ }
diff --git a/solr/core/src/test/org/apache/solr/analysis/ b/solr/core/src/test/org/apache/solr/analysis/
index d3a6ee77873..66b3a5c7743 100644
--- a/solr/core/src/test/org/apache/solr/analysis/
+++ b/solr/core/src/test/org/apache/solr/analysis/
@@ -25,32 +25,35 @@ import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.common.ResourceLoader;
+ * @deprecated Remove this test in Lucene 5.0
+ */
public class TestSynonymMap extends LuceneTestCase {
public void testInvalidMappingRules() throws Exception {
- SynonymMap synMap = new SynonymMap( true );
+ SlowSynonymMap synMap = new SlowSynonymMap( true );
List rules = new ArrayList( 1 );
rules.add( "a=>b=>c" );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
fail( "RuntimeException must be thrown." );
catch( RuntimeException expected ){}
public void testReadMappingRules() throws Exception {
- SynonymMap synMap;
+ SlowSynonymMap synMap;
// (a)->[b]
List rules = new ArrayList();
rules.add( "a=>b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "b" );
@@ -58,8 +61,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (b)->[c]
rules.add( "a,b=>c" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "c" );
assertTokIncludes( synMap, "b", "c" );
@@ -67,8 +70,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (a)->[b][c]
rules.add( "a=>b,c" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "b" );
assertTokIncludes( synMap, "a", "c" );
@@ -78,8 +81,8 @@ public class TestSynonymMap extends LuceneTestCase {
rules.add( "a=>a1" );
rules.add( "a b=>a2" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a1" );
assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -92,8 +95,8 @@ public class TestSynonymMap extends LuceneTestCase {
rules.add( "a=>a1" );
rules.add( "a b=>a2" );
rules.add( "a c=>a3" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a1" );
assertEquals( 2, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -109,8 +112,8 @@ public class TestSynonymMap extends LuceneTestCase {
rules.add( "a b=>a2" );
rules.add( "b=>b1" );
rules.add( "b c=>b2" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a1" );
assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -121,14 +124,14 @@ public class TestSynonymMap extends LuceneTestCase {
public void testRead1waySynonymRules() throws Exception {
- SynonymMap synMap;
+ SlowSynonymMap synMap;
// (a)->[a]
// (b)->[a]
List rules = new ArrayList();
rules.add( "a,b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "b", "a" );
@@ -138,8 +141,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (c)->[a]
rules.add( "a,b,c" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 3, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "b", "a" );
@@ -149,8 +152,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (b1)->(b2)->[a]
rules.add( "a,b1 b2" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
@@ -160,8 +163,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (b)->[a1][a2]
rules.add( "a1 a2,b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 2, synMap.submap.size() );
assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@@ -171,14 +174,14 @@ public class TestSynonymMap extends LuceneTestCase {
public void testRead2waySynonymRules() throws Exception {
- SynonymMap synMap;
+ SlowSynonymMap synMap;
// (a)->[a][b]
// (b)->[a][b]
List rules = new ArrayList();
rules.add( "a,b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b" );
@@ -190,8 +193,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (c)->[a][b][c]
rules.add( "a,b,c" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 3, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b" );
@@ -209,8 +212,8 @@ public class TestSynonymMap extends LuceneTestCase {
// [b1][b2]
rules.add( "a,b1 b2" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b1" );
@@ -226,8 +229,8 @@ public class TestSynonymMap extends LuceneTestCase {
// [b]
rules.add( "a1 a2,b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@@ -239,7 +242,7 @@ public class TestSynonymMap extends LuceneTestCase {
public void testBigramTokenizer() throws Exception {
- SynonymMap synMap;
+ SlowSynonymMap synMap;
// prepare bi-gram tokenizer factory
BaseTokenizerFactory tf = new NGramTokenizerFactory();
@@ -251,8 +254,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (ab)->(bc)->(cd)->[ef][fg][gh]
List rules = new ArrayList();
rules.add( "abcd=>efgh" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
assertEquals( 1, synMap.submap.size() );
assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
@@ -265,7 +268,7 @@ public class TestSynonymMap extends LuceneTestCase {
public void testLoadRules() throws Exception {
Map args = new HashMap();
args.put( "synonyms", "something.txt" );
- SynonymFilterFactory ff = new SynonymFilterFactory();
+ SlowSynonymFilterFactory ff = new SlowSynonymFilterFactory();
ff.inform( new ResourceLoader() {
@@ -289,7 +292,7 @@ public class TestSynonymMap extends LuceneTestCase {
- SynonymMap synMap = ff.getSynonymMap();
+ SlowSynonymMap synMap = ff.getSynonymMap();
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b" );
@@ -298,7 +301,7 @@ public class TestSynonymMap extends LuceneTestCase {
- private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception {
+ private void assertTokIncludes( SlowSynonymMap map, String src, String exp ) throws Exception {
Token[] tokens = map.submap.get( src ).synonyms;
boolean inc = false;
for( Token token : tokens ){
@@ -308,7 +311,7 @@ public class TestSynonymMap extends LuceneTestCase {
assertTrue( inc );
- private SynonymMap getSubSynonymMap( SynonymMap map, String src ){
+ private SlowSynonymMap getSubSynonymMap( SlowSynonymMap map, String src ){
return map.submap.get( src );