- Iterator<String> it = top.iterator();
- while ( it.hasNext())
- {
- // [2a] add to level words in
- String word = it.next();
- TermQuery tq = new TermQuery( new Term( field, word));
- tmp.add( tq, BooleanClause.Occur.SHOULD);
-
- // [2b] add in unique synonyms
- syns.search(new TermQuery( new Term(Syns2Index.F_WORD, word)), new Collector() {
- IndexReader reader;
-
- @Override
- public boolean acceptsDocsOutOfOrder() {
- return true;
- }
-
- @Override
- public void collect(int doc) throws IOException {
- Document d = reader.document(doc);
- String[] values = d.getValues( Syns2Index.F_SYN);
- for ( int j = 0; j < values.length; j++)
- {
- String syn = values[ j];
- if ( already.add( syn))
- {
- TermQuery tq = new TermQuery( new Term( field, syn));
- if ( boost > 0) // else keep normal 1.0
- tq.setBoost( boost);
- tmp.add( tq, BooleanClause.Occur.SHOULD);
- }
- }
- }
-
- @Override
- public void setNextReader(AtomicReaderContext context)
- throws IOException {
- this.reader = context.reader;
- }
-
- @Override
- public void setScorer(Scorer scorer) throws IOException {}
- });
- }
-
-
- return tmp;
- }
-
-}
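For context on the expansion loop above: SynExpand.expand builds a BooleanQuery of the original terms plus every unique synonym found in the Syns2Index-built index. A minimal sketch of driving it from application code, written against this pre-removal contrib API; the index paths, field name, analyzer, and boost are illustrative assumptions only.

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.wordnet.SynExpand;

public class SynExpandSketch {
  public static void main(String[] args) throws Exception {
    // Synonym index built by Syns2Index plus the regular content index (paths are hypothetical).
    IndexSearcher syns = new IndexSearcher(FSDirectory.open(new File("/path/to/syn-index")), true);
    IndexSearcher content = new IndexSearcher(FSDirectory.open(new File("/path/to/content-index")), true);
    try {
      // Expand "big" against the "contents" field; 0.9f down-weights the injected synonym clauses.
      Query q = SynExpand.expand("big", syns, new StandardAnalyzer(Version.LUCENE_CURRENT), "contents", 0.9f);
      TopDocs hits = content.search(q, 10);
      System.out.println(hits.totalHits + " hits for " + q);
    } finally {
      syns.close();
      content.close();
    }
  }
}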
diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java
deleted file mode 100644
index 099d653bef1..00000000000
--- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java
+++ /dev/null
@@ -1,400 +0,0 @@
-package org.apache.lucene.wordnet;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.TreeSet;
-
-/**
- * Loads the WordNet prolog file wn_s.pl
- * into a thread-safe main-memory hash map that can be used for fast
- * high-frequency lookups of synonyms for any given (lowercase) word string.
- *
- * Symmetry holds: if B is a synonym for A (A -> B), then A is also a synonym for B (B -> A).
- * Transitivity does not necessarily hold: A -> B and B -> C does not imply A -> C.
- *
- * Loading typically takes some 1.5 secs, so should be done only once per
- * (server) program execution, using a singleton pattern. Once loaded, a
- * synonym lookup via {@link #getSynonyms(String)} takes constant time O(1).
- * A loaded default synonym map consumes about 10 MB main memory.
- * An instance is immutable, hence thread-safe.
- *
- * This implementation borrows some ideas from the Lucene Syns2Index demo that
- * Dave Spencer originally contributed to Lucene. Dave's approach
- * involved a persistent Lucene index which is suitable for occasional
- * lookups or very large synonym tables, but considered unsuitable for
- * high-frequency lookups of medium size synonym tables.
- *
- * Example Usage:
- *
- * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
- * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
- * for (int i = 0; i < words.length; i++) {
- * String[] synonyms = map.getSynonyms(words[i]);
- * System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
- * }
- *
- *
- * Example output:
- *
- * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
- * woods:[forest, wood]
- * forest:[afforest, timber, timberland, wood, woodland, woods]
- * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
- * xxxx:[]
- *
- *
- *
- * See also:
- * prologdb
- * man page
- * Dave's synonym demo site
- */
-public class SynonymMap {
-
- /** the index data; Map<String word, String[] synonyms> */
- private final HashMap<String,String[]> table;
-
- private static final String[] EMPTY = new String[0];
-
- private static final boolean DEBUG = false;
-
- /**
- * Constructs an instance, loading WordNet synonym data from the given input
- * stream. Finally closes the stream. The words in the stream must be in
- * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
- *
- * @param input
- * the stream to read from (null indicates an empty synonym map)
- * @throws IOException
- * if an error occured while reading the stream.
- */
- public SynonymMap(InputStream input) throws IOException {
- this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input));
- }
-
- /**
- * Returns the synonym set for the given word, sorted ascending.
- *
- * @param word
- * the word to lookup (must be in lowercase).
- * @return the synonyms; a set of zero or more words, sorted ascending, each
- * word containing lowercase characters that satisfy
- * Character.isLetter().
- */
- public String[] getSynonyms(String word) {
- String[] synonyms = table.get(word);
- if (synonyms == null) return EMPTY;
- String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
- System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
- return copy;
- }
-
- /**
- * Returns a String representation of the index data for debugging purposes.
- *
- * @return a String representation
- */
- @Override
- public String toString() {
- StringBuilder buf = new StringBuilder();
- Iterator<String> iter = new TreeMap<String,String[]>(table).keySet().iterator();
- int count = 0;
- int f0 = 0;
- int f1 = 0;
- int f2 = 0;
- int f3 = 0;
-
- while (iter.hasNext()) {
- String word = iter.next();
- buf.append(word + ":");
- String[] synonyms = getSynonyms(word);
- buf.append(Arrays.asList(synonyms));
- buf.append("\n");
- count += synonyms.length;
- if (synonyms.length == 0) f0++;
- if (synonyms.length == 1) f1++;
- if (synonyms.length == 2) f2++;
- if (synonyms.length == 3) f3++;
- }
-
- buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
- return buf.toString();
- }
-
- /**
- * Analyzes/transforms the given word on input stream loading. This default implementation simply
- * lowercases the word. Override this method with a custom stemming
- * algorithm or similar, if desired.
- *
- * @param word
- * the word to analyze
- * @return the same word, or a different word (or null to indicate that the
- * word should be ignored)
- */
- protected String analyze(String word) {
- return word.toLowerCase();
- }
-
- protected boolean isValid(String str) {
- for (int i=str.length(); --i >= 0; ) {
- if (!Character.isLetter(str.charAt(i))) return false;
- }
- return true;
- }
-
- private HashMap<String,String[]> read(byte[] data) {
- int WORDS = (int) (76401 / 0.7); // presizing
- int GROUPS = (int) (88022 / 0.7); // presizing
- HashMap<String,ArrayList<Integer>> word2Groups = new HashMap<String,ArrayList<Integer>>(WORDS); // Map<word, group numbers>
- HashMap<Integer,ArrayList<String>> group2Words = new HashMap<Integer,ArrayList<String>>(GROUPS); // Map<group number, words>
- HashMap<String,String> internedWords = new HashMap<String,String>(WORDS);// Map<word, interned word>
-
- Charset charset = Charset.forName("UTF-8");
- int lastNum = -1;
- Integer lastGroup = null;
- int len = data.length;
- int i=0;
-
- while (i < len) { // until EOF
- /* Part A: Parse a line */
-
- // scan to beginning of group
- while (i < len && data[i] != '(') i++;
- if (i >= len) break; // EOF
- i++;
-
- // parse group
- int num = 0;
- while (i < len && data[i] != ',') {
- num = 10*num + (data[i] - 48);
- i++;
- }
- i++;
-// if (DEBUG) System.err.println("num="+ num);
-
- // scan to beginning of word
- while (i < len && data[i] != '\'') i++;
- i++;
-
- // scan to end of word
- int start = i;
- do {
- while (i < len && data[i] != '\'') i++;
- i++;
- } while (i < len && data[i] != ','); // word must end with "',"
-
- if (i >= len) break; // EOF
- String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
-// String word = new String(data, 0, start, i-start-1); // ASCII
-
- /*
- * Part B: ignore phrases (with spaces and hyphens) and
- * non-alphabetic words, and let user customize word (e.g. do some
- * stemming)
- */
- if (!isValid(word)) continue; // ignore
- word = analyze(word);
- if (word == null || word.length() == 0) continue; // ignore
-
-
- /* Part C: Add (group,word) to tables */
-
- // ensure compact string representation, minimizing memory overhead
- String w = internedWords.get(word);
- if (w == null) {
- word = new String(word); // ensure compact string
- internedWords.put(word, word);
- } else {
- word = w;
- }
-
- Integer group = lastGroup;
- if (num != lastNum) {
- group = Integer.valueOf(num);
- lastGroup = group;
- lastNum = num;
- }
-
- // add word --> group
- ArrayList<Integer> groups = word2Groups.get(word);
- if (groups == null) {
- groups = new ArrayList<Integer>(1);
- word2Groups.put(word, groups);
- }
- groups.add(group);
-
- // add group --> word
- ArrayList<String> words = group2Words.get(group);
- if (words == null) {
- words = new ArrayList<String>(1);
- group2Words.put(group, words);
- }
- words.add(word);
- }
-
-
- /* Part D: compute index data structure */
- HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);
-
- /* Part E: minimize memory consumption by a factor 3 (or so) */
-// if (true) return word2Syns;
- word2Groups = null; // help gc
- //TODO: word2Groups.clear(); would be more appropriate ?
- group2Words = null; // help gc
- //TODO: group2Words.clear(); would be more appropriate ?
-
- return optimize(word2Syns, internedWords);
- }
-
- private HashMap<String,String[]> createIndex(Map<String,ArrayList<Integer>> word2Groups, Map<Integer,ArrayList<String>> group2Words) {
- HashMap<String,String[]> word2Syns = new HashMap<String,String[]>();
-
- for (final Map.Entry<String,ArrayList<Integer>> entry : word2Groups.entrySet()) { // for each word
- ArrayList<Integer> group = entry.getValue();
- String word = entry.getKey();
-
-// HashSet<String> synonyms = new HashSet<String>();
- TreeSet<String> synonyms = new TreeSet<String>();
- for (int i=group.size(); --i >= 0; ) { // for each groupID of word
- ArrayList<String> words = group2Words.get(group.get(i));
- for (int j=words.size(); --j >= 0; ) { // add all words
- String synonym = words.get(j); // note that w and word are interned
- if (synonym != word) { // a word is implicitly it's own synonym
- synonyms.add(synonym);
- }
- }
- }
-
- int size = synonyms.size();
- if (size > 0) {
- String[] syns = new String[size];
- if (size == 1)
- syns[0] = synonyms.first();
- else
- synonyms.toArray(syns);
-// if (syns.length > 1) Arrays.sort(syns);
-// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
- word2Syns.put(word, syns);
- }
- }
-
- return word2Syns;
- }
-
- private HashMap<String,String[]> optimize(HashMap<String,String[]> word2Syns, HashMap<String,String> internedWords) {
- if (DEBUG) {
- System.err.println("before gc");
- for (int i=0; i < 10; i++) System.gc();
- System.err.println("after gc");
- }
-
- // collect entries
- int len = 0;
- int size = word2Syns.size();
- String[][] allSynonyms = new String[size][];
- String[] words = new String[size];
- Iterator<Map.Entry<String,String[]>> iter = word2Syns.entrySet().iterator();
- for (int j=0; j < size; j++) {
- Map.Entry<String,String[]> entry = iter.next();
- allSynonyms[j] = entry.getValue();
- words[j] = entry.getKey();
- len += words[j].length();
- }
-
- // assemble large string containing all words
- StringBuilder buf = new StringBuilder(len);
- for (int j=0; j < size; j++) buf.append(words[j]);
- String allWords = new String(buf.toString()); // ensure compact string across JDK versions
- buf = null;
-
- // intern words at app level via memory-overlaid substrings
- for (int p=0, j=0; j < size; j++) {
- String word = words[j];
- internedWords.put(word, allWords.substring(p, p + word.length()));
- p += word.length();
- }
-
- // replace words with interned words
- for (int j=0; j < size; j++) {
- String[] syns = allSynonyms[j];
- for (int k=syns.length; --k >= 0; ) {
- syns[k] = internedWords.get(syns[k]);
- }
- word2Syns.remove(words[j]);
- word2Syns.put(internedWords.get(words[j]), syns);
- }
-
- if (DEBUG) {
- words = null;
- allSynonyms = null;
- internedWords = null;
- allWords = null;
- System.err.println("before gc");
- for (int i=0; i < 10; i++) System.gc();
- System.err.println("after gc");
- }
- return word2Syns;
- }
-
- // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
- private static byte[] toByteArray(InputStream input) throws IOException {
- try {
- // safe and fast even if input.available() behaves weird or buggy
- int len = Math.max(256, input.available());
- byte[] buffer = new byte[len];
- byte[] output = new byte[len];
-
- len = 0;
- int n;
- while ((n = input.read(buffer)) >= 0) {
- if (len + n > output.length) { // grow capacity
- byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
- System.arraycopy(output, 0, tmp, 0, len);
- System.arraycopy(buffer, 0, tmp, len, n);
- buffer = output; // use larger buffer for future larger bulk reads
- output = tmp;
- } else {
- System.arraycopy(buffer, 0, output, len, n);
- }
- len += n;
- }
-
- if (len == output.length) return output;
- buffer = null; // help gc
- buffer = new byte[len];
- System.arraycopy(output, 0, buffer, 0, len);
- return buffer;
- } finally {
- input.close();
- }
- }
-
-}
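The byte-level loop in read() above walks wn_s.pl entries of the form s(synset,wordnum,'word',...), pulling out the synset number and the quoted word before grouping. A standalone sketch of that extraction on a single line; it ignores the doubled-quote escape that the full parser and Syns2Index handle.

public class WordNetLineSketch {
  public static void main(String[] args) {
    String line = "s(100000001,2,'wood',n,1,0)."; // same format as testSynonyms.txt later in this diff
    int open = line.indexOf('(');
    int comma = line.indexOf(',', open);
    int group = Integer.parseInt(line.substring(open + 1, comma)); // synset/group number
    int q1 = line.indexOf('\'', comma);
    int q2 = line.indexOf('\'', q1 + 1);
    String word = line.substring(q1 + 1, q2);                      // the quoted word
    System.out.println(group + " -> " + word);                     // 100000001 -> wood
  }
}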
diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymTokenFilter.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymTokenFilter.java
deleted file mode 100644
index e4b45a0c691..00000000000
--- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymTokenFilter.java
+++ /dev/null
@@ -1,148 +0,0 @@
-package org.apache.lucene.wordnet;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeSource;
-
-/**
- * Injects additional tokens for synonyms of token terms fetched from the
- * underlying child stream; the child stream must deliver lowercase tokens
- * for synonyms to be found.
- *
- */
-public class SynonymTokenFilter extends TokenFilter {
-
- /** The Token.type used to indicate a synonym to higher level filters. */
- public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
-
- private final SynonymMap synonyms;
- private final int maxSynonyms;
-
- private String[] stack = null;
- private int index = 0;
- private AttributeSource.State current = null;
- private int todo = 0;
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-
- /**
- * Creates an instance for the given underlying stream and synonym table.
- *
- * @param input
- * the underlying child token stream
- * @param synonyms
- * the map used to extract synonyms for terms
- * @param maxSynonyms
- * the maximum number of synonym tokens to return per underlying
- * token word (a value of Integer.MAX_VALUE indicates unlimited)
- */
- public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
- super(input);
- if (input == null)
- throw new IllegalArgumentException("input must not be null");
- if (synonyms == null)
- throw new IllegalArgumentException("synonyms must not be null");
- if (maxSynonyms < 0)
- throw new IllegalArgumentException("maxSynonyms must not be negative");
-
- this.synonyms = synonyms;
- this.maxSynonyms = maxSynonyms;
- }
-
- /** Returns the next token in the stream, or null at EOS. */
- @Override
- public final boolean incrementToken() throws IOException {
- while (todo > 0 && index < stack.length) { // pop from stack
- if (createToken(stack[index++], current)) {
- todo--;
- return true;
- }
- }
-
- if (!input.incrementToken()) return false; // EOS; iterator exhausted
-
- stack = synonyms.getSynonyms(termAtt.toString()); // push onto stack
- if (stack.length > maxSynonyms) randomize(stack);
- index = 0;
- current = captureState();
- todo = maxSynonyms;
- return true;
- }
-
- /**
- * Creates and returns a token for the given synonym of the current input
- * token; Override for custom (stateless or stateful) behavior, if desired.
- *
- * @param synonym
- * a synonym for the current token's term
- * @param current
- * the current token from the underlying child stream
- * @return a new token, or null to indicate that the given synonym should be
- * ignored
- */
- protected boolean createToken(String synonym, AttributeSource.State current) {
- restoreState(current);
- termAtt.setEmpty().append(synonym);
- typeAtt.setType(SYNONYM_TOKEN_TYPE);
- posIncrAtt.setPositionIncrement(0);
- return true;
- }
-
- /**
- * Randomize synonyms to later sample a subset. Uses constant random seed
- * for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
- * number generator with medium statistical quality (multiplicative
- * congruential method), producing integers in the range [Integer.MIN_VALUE,
- * Integer.MAX_VALUE].
- */
- private static void randomize(Object[] arr) {
- int seed = 1234567; // constant
- int randomState = 4*seed + 1;
-// Random random = new Random(seed); // unnecessary overhead
- int len = arr.length;
- for (int i=0; i < len-1; i++) {
- randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
- int r = randomState % (len-i);
- if (r < 0) r = -r; // e.g. -9 % 2 == -1
-// int r = random.nextInt(len-i);
-
- // swap arr[i, i+r]
- Object tmp = arr[i];
- arr[i] = arr[i + r];
- arr[i + r] = tmp;
- }
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- stack = null;
- index = 0;
- current = null;
- todo = 0;
- }
-}
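A note on randomize() above: it is a constant-seeded, in-place shuffle (a multiplicative congruential generator driving Fisher-Yates-style swaps), so when more synonyms exist than maxSynonyms the truncated sample is still deterministic across runs. A self-contained sketch of the same arithmetic, showing two identical arrays shuffle into the same order:

import java.util.Arrays;

public class DeterministicShuffleSketch {
  // Same constants and update rule as SynonymTokenFilter.randomize().
  static void shuffle(Object[] arr) {
    int randomState = 4 * 1234567 + 1;
    for (int i = 0; i < arr.length - 1; i++) {
      randomState *= 0x278DDE6D;                 // z(i+1) = a*z(i) (mod 2**32)
      int r = randomState % (arr.length - i);
      if (r < 0) r = -r;
      Object tmp = arr[i]; arr[i] = arr[i + r]; arr[i + r] = tmp;
    }
  }

  public static void main(String[] args) {
    String[] a = { "wood", "forest", "timber", "woodland", "timberland" };
    String[] b = a.clone();
    shuffle(a);
    shuffle(b);
    System.out.println(Arrays.equals(a, b));     // true: same seed, same order every run
  }
}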
diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/Syns2Index.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/Syns2Index.java
deleted file mode 100644
index 8d3ea0c3d60..00000000000
--- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/Syns2Index.java
+++ /dev/null
@@ -1,329 +0,0 @@
-package org.apache.lucene.wordnet;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-import java.io.PrintStream;
-import java.io.Reader;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.TieredMergePolicy;
-import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Version;
-
-/**
- * Convert the prolog file wn_s.pl from the WordNet prolog download
- * into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
- *
- * This has been tested with WordNet 2.0.
- *
- * The index has fields named "word" ({@link #F_WORD})
- * and "syn" ({@link #F_SYN}).
- *
- * The source word (such as 'big') can be looked up in the
- * "word" field, and if present there will be fields named "syn"
- * for every synonym. What's tricky here is that there could be multiple
- * fields with the same name, in the general case for words that have multiple synonyms.
- * That's not a problem with Lucene, you just use {@link org.apache.lucene.document.Document#getValues}
- *
- *
- * While the WordNet file distinguishes groups of synonyms with
- * related meanings we don't do that here.
- *
- *
- * This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
- *
- * @see WordNet home page
- * @see prologdb man page
- * @see sample site that uses it
- */
-public class Syns2Index
-{
- /**
- *
- */
- private static final PrintStream o = System.out;
-
- /**
- *
- */
- private static final PrintStream err = System.err;
-
- /**
- *
- */
- public static final String F_SYN = "syn";
-
- /**
- *
- */
- public static final String F_WORD = "word";
-
- /**
- * we don't actually analyze any text (only a NOT_ANALYZED field),
- * but analyzer can't be null, docinverter wants the offset gap!
- */
- private static final Analyzer ana = new Analyzer() {
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return null;
- }
- };
-
- /**
- * Takes arg of prolog file name and index directory.
- */
- public static void main(String[] args)
- throws Throwable
- {
- // get command line arguments
- String prologFilename = null; // name of file "wn_s.pl"
- String indexDir = null;
- if (args.length == 2)
- {
- prologFilename = args[0];
- indexDir = args[1];
- }
- else
- {
- usage();
- System.exit(1);
- }
-
- // ensure that the prolog file is readable
- if (! (new File(prologFilename)).canRead())
- {
- err.println("Error: cannot read Prolog file: " + prologFilename);
- System.exit(1);
- }
- // exit if the target index directory already exists
- if ((new File(indexDir)).isDirectory())
- {
- err.println("Error: index directory already exists: " + indexDir);
- err.println("Please specify a name of a non-existent directory");
- System.exit(1);
- }
-
- o.println("Opening Prolog file " + prologFilename);
- final FileInputStream fis = new FileInputStream(prologFilename);
- final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
- String line;
-
- // maps a word to all the "groups" it's in
- final Map<String,List<String>> word2Nums = new TreeMap<String,List<String>>();
- // maps a group to all the words in it
- final Map<String,List<String>> num2Words = new TreeMap<String,List<String>>();
- // number of rejected words
- int ndecent = 0;
-
- // status output
- int mod = 1;
- int row = 1;
- // parse prolog file
- o.println( "[1/2] Parsing " + prologFilename);
- while ((line = br.readLine()) != null)
- {
- // occasional progress
- if ((++row) % mod == 0) // periodically print out line we read in
- {
- mod *= 2;
- o.println("\t" + row + " " + line + " " + word2Nums.size()
- + " " + num2Words.size() + " ndecent=" + ndecent);
- }
-
- // syntax check
- if (! line.startsWith("s("))
- {
- err.println("OUCH: " + line);
- System.exit(1);
- }
-
- // parse line
- line = line.substring(2);
- int comma = line.indexOf(',');
- String num = line.substring(0, comma);
- int q1 = line.indexOf('\'');
- line = line.substring(q1 + 1);
- int q2 = line.lastIndexOf('\'');
- String word = line.substring(0, q2).toLowerCase().replace("''", "'");
-
- // make sure is a normal word
- if (! isDecent(word))
- {
- ndecent++;
- continue; // don't store words w/ spaces
- }
-
- // 1/2: word2Nums map
- // append to entry or add new one
- List<String> lis = word2Nums.get(word);
- if (lis == null)
- {
- lis = new LinkedList<String>();
- lis.add(num);
- word2Nums.put(word, lis);
- }
- else
- lis.add(num);
-
- // 2/2: num2Words map
- lis = num2Words.get(num);
- if (lis == null)
- {
- lis = new LinkedList<String>();
- lis.add(word);
- num2Words.put(num, lis);
- }
- else
- lis.add(word);
- }
-
- // close the streams
- fis.close();
- br.close();
-
- // create the index
- o.println( "[2/2] Building index to store synonyms, " +
- " map sizes are " + word2Nums.size() + " and " + num2Words.size());
- index(indexDir, word2Nums, num2Words);
- }
-
- /**
- * Checks to see if a word contains only alphabetic characters by
- * checking it one character at a time.
- *
- * @param s string to check
- * @return true if the string is decent
- */
- private static boolean isDecent(String s)
- {
- int len = s.length();
- for (int i = 0; i < len; i++)
- {
- if (!Character.isLetter(s.charAt(i)))
- {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Forms a Lucene index based on the 2 maps.
- *
- * @param indexDir the directory where the index should be created
- * @param word2Nums
- * @param num2Words
- */
- private static void index(String indexDir, Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words)
- throws Throwable
- {
- int row = 0;
- int mod = 1;
- FSDirectory dir = FSDirectory.open(new File(indexDir));
- try {
-
- // override the specific index if it already exists
- IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
- Version.LUCENE_CURRENT, ana).setOpenMode(OpenMode.CREATE));
- ((TieredMergePolicy) writer.getConfig().getMergePolicy()).setUseCompoundFile(true); // why?
- Iterator<String> i1 = word2Nums.keySet().iterator();
- while (i1.hasNext()) // for each word
- {
- String g = i1.next();
- Document doc = new Document();
-
- int n = index(word2Nums, num2Words, g, doc);
- if (n > 0)
- {
- doc.add( new Field( F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED));
- if ((++row % mod) == 0)
- {
- o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc);
- mod *= 2;
- }
- writer.addDocument(doc);
- } // else degenerate
- }
- o.println( "Optimizing..");
- writer.optimize();
- writer.close();
- } finally {
- dir.close();
- }
- }
-
- /**
- * Given the 2 maps fills a document for 1 word.
- */
- private static int index(Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words, String g, Document doc)
- throws Throwable
- {
- List<String> keys = word2Nums.get(g); // get list of key#'s
- Iterator<String> i2 = keys.iterator();
-
- Set<String> already = new TreeSet<String>(); // keep them sorted
-
- // pass 1: fill up 'already' with all words
- while (i2.hasNext()) // for each key#
- {
- already.addAll(num2Words.get(i2.next())); // get list of words
- }
- int num = 0;
- already.remove(g); // of course a word is it's own syn
- Iterator<String> it = already.iterator();
- while (it.hasNext())
- {
- String cur = it.next();
- // don't store things like 'pit bull' -> 'american pit bull'
- if (!isDecent(cur))
- {
- continue;
- }
- num++;
- doc.add( new Field( F_SYN, cur, Field.Store.YES, Field.Index.NO));
- }
- return num;
- }
-
- /**
- *
- */
- private static void usage()
- {
- o.println("\n\n" +
- "java org.apache.lucene.wordnet.Syns2Index \n\n");
- }
-
-}
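The index Syns2Index writes holds one document per accepted word: the word itself in F_WORD ("word") and each synonym in a repeated F_SYN ("syn") field, which is exactly what SynExpand iterates with Document.getValues. A hedged lookup sketch against such an index, using the pre-removal API; the index path is an illustrative assumption.

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.wordnet.Syns2Index;

public class SynLookupSketch {
  public static void main(String[] args) throws Exception {
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("/path/to/syn-index")), true);
    try {
      TopDocs td = searcher.search(new TermQuery(new Term(Syns2Index.F_WORD, "woods")), 10);
      for (ScoreDoc sd : td.scoreDocs) {
        Document doc = searcher.doc(sd.doc);
        for (String syn : doc.getValues(Syns2Index.F_SYN)) {
          System.out.println(syn); // e.g. forest, wood
        }
      }
    } finally {
      searcher.close();
    }
  }
}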
diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/package.html b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/package.html
deleted file mode 100755
index 19c5b579ba4..00000000000
--- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/package.html
+++ /dev/null
@@ -1,57 +0,0 @@
-
-
-
-
-WordNet Lucene Synonyms Integration
-
-
-
- This package uses synonyms defined by WordNet.
- There are two methods: query expansion and analysis.
-
- Both methods first require you to download the WordNet prolog database
- Inside this archive is a file named wn_s.pl, which contains the WordNet synonyms.
-
- Query Expansion Method
- This method creates Lucene index storing the synonyms, which in turn can be used for query expansion.
-
- You normally run {@link org.apache.lucene.wordnet.Syns2Index} once to build the query index/"database", and then call
- {@link org.apache.lucene.wordnet.SynExpand#expand SynExpand.expand(...)} to expand a query.
-
-
-
-
- Instructions
-
- - Invoke Syns2Index as appropriate to build a synonym index.
- It'll take 2 arguments, the path to wn_s.pl from the WordNet download, and the index name.
-
-
- Update your UI so that as appropriate you call SynExpand.expand(...) to expand user queries with synonyms.
-
-
- Analysis Method
- This method injects additional synonym tokens for tokens from a child {@link org.apache.lucene.analysis.TokenStream}.
-
- Instructions
-
- - Create a {@link org.apache.lucene.wordnet.SynonymMap}, passing in the path to wn_s.pl
-
- Add a {@link org.apache.lucene.wordnet.SynonymTokenFilter} to your analyzer. Note: SynonymTokenFilter should be after LowerCaseFilter,
- because it expects terms to already be in lowercase.
-
-
-
-
\ No newline at end of file
diff --git a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java b/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java
deleted file mode 100644
index 6959a3ed0a8..00000000000
--- a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java
+++ /dev/null
@@ -1,119 +0,0 @@
-package org.apache.lucene.wordnet;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-
-public class TestSynonymTokenFilter extends BaseTokenStreamTestCase {
- final String testFile = "testSynonyms.txt";
-
- public void testSynonyms() throws Exception {
- SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
- /* all expansions */
- Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE);
- assertAnalyzesTo(analyzer, "Lost in the woods",
- new String[] { "lost", "in", "the", "woods", "forest", "wood" },
- new int[] { 0, 5, 8, 12, 12, 12 },
- new int[] { 4, 7, 11, 17, 17, 17 },
- new int[] { 1, 1, 1, 1, 0, 0 });
- }
-
- public void testSynonymsSingleQuote() throws Exception {
- SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
- /* all expansions */
- Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE);
- assertAnalyzesTo(analyzer, "king",
- new String[] { "king", "baron" });
- }
-
- public void testSynonymsLimitedAmount() throws Exception {
- SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
- /* limit to one synonym expansion */
- Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
- assertAnalyzesTo(analyzer, "Lost in the woods",
- /* wood comes before forest due to
- * the input file, not lexicographic order
- */
- new String[] { "lost", "in", "the", "woods", "wood" },
- new int[] { 0, 5, 8, 12, 12 },
- new int[] { 4, 7, 11, 17, 17 },
- new int[] { 1, 1, 1, 1, 0 });
- }
-
- public void testReusableTokenStream() throws Exception {
- SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
- /* limit to one synonym expansion */
- Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
- assertAnalyzesToReuse(analyzer, "Lost in the woods",
- new String[] { "lost", "in", "the", "woods", "wood" },
- new int[] { 0, 5, 8, 12, 12 },
- new int[] { 4, 7, 11, 17, 17 },
- new int[] { 1, 1, 1, 1, 0 });
- assertAnalyzesToReuse(analyzer, "My wolfish dog went to the forest",
- new String[] { "my", "wolfish", "ravenous", "dog", "went", "to",
- "the", "forest", "woods" },
- new int[] { 0, 3, 3, 11, 15, 20, 23, 27, 27 },
- new int[] { 2, 10, 10, 14, 19, 22, 26, 33, 33 },
- new int[] { 1, 1, 0, 1, 1, 1, 1, 1, 0 });
- }
-
- private class SynonymWhitespaceAnalyzer extends Analyzer {
- private SynonymMap synonyms;
- private int maxSynonyms;
-
- public SynonymWhitespaceAnalyzer(SynonymMap synonyms, int maxSynonyms) {
- this.synonyms = synonyms;
- this.maxSynonyms = maxSynonyms;
- }
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream ts = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
- ts = new SynonymTokenFilter(ts, synonyms, maxSynonyms);
- return ts;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- }
-
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
- streams.result = new SynonymTokenFilter(streams.source, synonyms, maxSynonyms);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
- }
- }
-
-}
diff --git a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestWordnet.java b/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestWordnet.java
deleted file mode 100644
index ccd855931a5..00000000000
--- a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestWordnet.java
+++ /dev/null
@@ -1,94 +0,0 @@
-package org.apache.lucene.wordnet;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util._TestUtil;
-
-public class TestWordnet extends LuceneTestCase {
- private IndexSearcher searcher;
- private Directory dir;
-
- String storePathName = new File(TEMP_DIR,"testLuceneWordnet").getAbsolutePath();
-
- @Override
- public void setUp() throws Exception {
- super.setUp();
- // create a temporary synonym index
- File testFile = getDataFile("testSynonyms.txt");
- String commandLineArgs[] = { testFile.getAbsolutePath(), storePathName };
- _TestUtil.rmDir(new File(storePathName));
-
- try {
- Syns2Index.main(commandLineArgs);
- } catch (Throwable t) { throw new RuntimeException(t); }
-
- dir = newFSDirectory(new File(storePathName));
- searcher = new IndexSearcher(dir, true);
- }
-
- public void testExpansion() throws IOException {
- assertExpandsTo("woods", new String[] { "woods", "forest", "wood" });
- }
-
- public void testExpansionSingleQuote() throws IOException {
- assertExpandsTo("king", new String[] { "king", "baron" });
- }
-
- private void assertExpandsTo(String term, String expected[]) throws IOException {
- Query expandedQuery = SynExpand.expand(term, searcher, new
- MockAnalyzer(random), "field", 1F);
- BooleanQuery expectedQuery = new BooleanQuery();
- for (String t : expected)
- expectedQuery.add(new TermQuery(new Term("field", t)),
- BooleanClause.Occur.SHOULD);
- assertEquals(expectedQuery, expandedQuery);
- }
-
- @Override
- public void tearDown() throws Exception {
- if (searcher != null) {
- searcher.close();
- }
- if (dir != null) {
- dir.close();
- }
- rmDir(storePathName); // delete our temporary synonym index
- super.tearDown();
- }
-
- private void rmDir(String directory) {
- File dir = new File(directory);
- File[] files = dir.listFiles();
- for (int i = 0; i < files.length; i++) {
- files[i].delete();
- }
- dir.delete();
- }
-}
diff --git a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/testSynonyms.txt b/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/testSynonyms.txt
deleted file mode 100644
index 822bc96858c..00000000000
--- a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/testSynonyms.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-s(100000001,1,'woods',n,1,0).
-s(100000001,2,'wood',n,1,0).
-s(100000001,3,'forest',n,1,0).
-s(100000002,1,'wolfish',n,1,0).
-s(100000002,2,'ravenous',n,1,0).
-s(100000003,1,'king',n,1,1).
-s(100000003,2,'baron',n,1,1).
-s(100000004,1,'king''sevil',n,1,1).
-s(100000004,2,'meany',n,1,1).
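The last two lines above exercise the doubled-quote escape: Syns2Index lowercases the quoted word and replaces '' with a single quote before the isDecent check, so king''sevil becomes king'sevil and is dropped (it contains a non-letter character), which is why the tests only expect king and baron for those groups. A tiny sketch of that unescape-and-check step in isolation:

public class QuoteEscapeSketch {
  public static void main(String[] args) {
    String raw = "king''sevil";                          // text between the quotes in the line above
    String word = raw.toLowerCase().replace("''", "'");  // -> king'sevil, as in Syns2Index.main
    boolean decent = true;
    for (int i = 0; i < word.length(); i++) {
      if (!Character.isLetter(word.charAt(i))) decent = false; // the apostrophe fails this check
    }
    System.out.println(word + " decent? " + decent);     // king'sevil decent? false
  }
}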
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java
index 0c6f51493c8..78f39da6523 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java
@@ -95,9 +95,6 @@ public class MemoryCodec extends Codec {
this.out = out;
this.field = field;
builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs);
-
- // The byte[] output we create can easily be > 255 bytes:
- builder.setAllowArrayArcs(false);
}
private class PostingsWriter extends PostingsConsumer {
diff --git a/lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java b/lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java
new file mode 100644
index 00000000000..0c0a92145d9
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java
@@ -0,0 +1,52 @@
+package org.apache.lucene.store;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * @lucene.experimental
+ */
+public class ByteArrayDataOutput extends DataOutput {
+ private byte[] bytes;
+
+ private int pos;
+ private int limit;
+
+ public ByteArrayDataOutput(byte[] bytes) {
+ reset(bytes);
+ }
+
+ public ByteArrayDataOutput(byte[] bytes, int offset, int len) {
+ reset(bytes, offset, len);
+ }
+
+ public ByteArrayDataOutput() {
+ reset(BytesRef.EMPTY_BYTES);
+ }
+
+ public void reset(byte[] bytes) {
+ reset(bytes, 0, bytes.length);
+ }
+
+ public void reset(byte[] bytes, int offset, int len) {
+ this.bytes = bytes;
+ pos = offset;
+ limit = offset + len;
+ }
+
+ public int getPosition() {
+ return pos;
+ }
+
+ @Override
+ public void writeByte(byte b) {
+ assert pos < limit;
+ bytes[pos++] = b;
+ }
+
+ @Override
+ public void writeBytes(byte[] b, int offset, int length) {
+ assert pos + length <= limit;
+ System.arraycopy(b, offset, bytes, pos, length);
+ pos += length;
+ }
+}
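A short usage sketch for the new class: it wraps a caller-supplied, fixed-size byte[] and never grows it (the asserts guard overflow), getPosition() reports how many bytes were written, and reset() allows reuse; the vInt encoding comes from the DataOutput base class.

import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataOutput;

public class ByteArrayDataOutputSketch {
  public static void main(String[] args) throws IOException {
    byte[] buffer = new byte[16];                            // fixed-size, caller-owned
    ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
    out.writeVInt(300);                                      // inherited from DataOutput: 2 bytes here
    out.writeByte((byte) 7);
    System.out.println("bytes used: " + out.getPosition()); // 3
    out.reset(buffer);                                       // rewind and reuse the same array
  }
}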
diff --git a/lucene/src/java/org/apache/lucene/util/CharsRef.java b/lucene/src/java/org/apache/lucene/util/CharsRef.java
index 2d87a0dabfd..088d9faaa90 100644
--- a/lucene/src/java/org/apache/lucene/util/CharsRef.java
+++ b/lucene/src/java/org/apache/lucene/util/CharsRef.java
@@ -1,5 +1,7 @@
package org.apache.lucene.util;
+import java.util.Comparator;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -167,7 +169,11 @@ public final class CharsRef implements Comparable, CharSequence {
* the {@link CharsRef} to copy
*/
public void copy(CharsRef other) {
- chars = ArrayUtil.grow(chars, other.length);
+ if (chars == null) {
+ chars = new char[other.length];
+ } else {
+ chars = ArrayUtil.grow(chars, other.length);
+ }
System.arraycopy(other.chars, other.offset, chars, 0, other.length);
length = other.length;
offset = 0;
@@ -213,4 +219,56 @@ public final class CharsRef implements Comparable, CharSequence {
public CharSequence subSequence(int start, int end) {
return new CharsRef(chars, offset + start, offset + end - 1);
}
+
+ private final static Comparator<CharsRef> utf16SortedAsUTF8SortOrder = new UTF16SortedAsUTF8Comparator();
+
+ public static Comparator<CharsRef> getUTF16SortedAsUTF8Comparator() {
+ return utf16SortedAsUTF8SortOrder;
+ }
+
+ private static class UTF16SortedAsUTF8Comparator implements Comparator<CharsRef> {
+ // Only singleton
+ private UTF16SortedAsUTF8Comparator() {};
+
+ public int compare(CharsRef a, CharsRef b) {
+ if (a == b)
+ return 0;
+
+ final char[] aChars = a.chars;
+ int aUpto = a.offset;
+ final char[] bChars = b.chars;
+ int bUpto = b.offset;
+
+ final int aStop = aUpto + Math.min(a.length, b.length);
+
+ while (aUpto < aStop) {
+ char aChar = aChars[aUpto++];
+ char bChar = bChars[bUpto++];
+ if (aChar != bChar) {
+ // http://icu-project.org/docs/papers/utf16_code_point_order.html
+
+ /* aChar != bChar, fix up each one if they're both in or above the surrogate range, then compare them */
+ if (aChar >= 0xd800 && bChar >= 0xd800) {
+ if (aChar >= 0xe000) {
+ aChar -= 0x800;
+ } else {
+ aChar += 0x2000;
+ }
+
+ if (bChar >= 0xe000) {
+ bChar -= 0x800;
+ } else {
+ bChar += 0x2000;
+ }
+ }
+
+ /* now aChar and bChar are in code point order */
+ return (int)aChar - (int)bChar; /* int must be 32 bits wide */
+ }
+ }
+
+ // One is a prefix of the other, or, they are equal:
+ return a.length - b.length;
+ }
+ }
}
\ No newline at end of file
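A small sketch of what the new comparator changes relative to raw UTF-16 ordering: a BMP character above the surrogate block (here U+FB01) compares after a surrogate pair in plain char order, but before it in UTF-8/code-point order, which is the order the comparator restores. The CharsRef(String) constructor is assumed to exist elsewhere in this class.

import java.util.Comparator;
import org.apache.lucene.util.CharsRef;

public class CharsRefOrderSketch {
  public static void main(String[] args) {
    String ligature = "\uFB01";           // U+FB01, a BMP char above the surrogate range
    String gothicG  = "\uD835\uDD0A";     // U+1D50A, encoded as a surrogate pair

    // Raw UTF-16 char order: the high surrogate 0xD835 sorts before 0xFB01.
    System.out.println(ligature.compareTo(gothicG) > 0);   // true

    // Code point (UTF-8) order: U+FB01 < U+1D50A, which is what the comparator restores.
    Comparator<CharsRef> cmp = CharsRef.getUTF16SortedAsUTF8Comparator();
    System.out.println(cmp.compare(new CharsRef(ligature), new CharsRef(gothicG)) < 0); // true
  }
}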
diff --git a/lucene/src/java/org/apache/lucene/util/fst/FST.java b/lucene/src/java/org/apache/lucene/util/fst/FST.java
index 7fa3339a256..ccc49cd0f87 100644
--- a/lucene/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/src/java/org/apache/lucene/util/fst/FST.java
@@ -71,7 +71,11 @@ public class FST {
// Increment version to change it
private final static String FILE_FORMAT_NAME = "FST";
private final static int VERSION_START = 0;
- private final static int VERSION_CURRENT = VERSION_START;
+
+ /** Changed numBytesPerArc for array'd case from byte to int. */
+ private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1;
+
+ private final static int VERSION_CURRENT = VERSION_INT_NUM_BYTES_PER_ARC;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@@ -106,6 +110,8 @@ public class FST {
private boolean allowArrayArcs = true;
+ private Arc<T> cachedRootArcs[];
+
public final static class Arc<T> {
public int label;
public T output;
@@ -113,7 +119,7 @@ public class FST {
int target;
byte flags;
- T nextFinalOutput;
+ public T nextFinalOutput;
int nextArc;
// This is non-zero if current arcs are fixed array:
@@ -176,7 +182,7 @@ public class FST {
public FST(DataInput in, Outputs<T> outputs) throws IOException {
this.outputs = outputs;
writer = null;
- CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_START);
+ CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_INT_NUM_BYTES_PER_ARC, VERSION_INT_NUM_BYTES_PER_ARC);
if (in.readByte() == 1) {
// accepts empty string
int numBytes = in.readVInt();
@@ -209,6 +215,8 @@ public class FST {
bytes = new byte[in.readVInt()];
in.readBytes(bytes, 0, bytes.length);
NO_OUTPUT = outputs.getNoOutput();
+
+ cacheRootArcs();
}
public INPUT_TYPE getInputType() {
@@ -220,7 +228,7 @@ public class FST {
return bytes.length;
}
- void finish(int startNode) {
+ void finish(int startNode) throws IOException {
if (startNode == FINAL_END_NODE && emptyOutput != null) {
startNode = 0;
}
@@ -231,6 +239,32 @@ public class FST {
System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite);
bytes = finalBytes;
this.startNode = startNode;
+
+ cacheRootArcs();
+ }
+
+ // Caches first 128 labels
+ @SuppressWarnings("unchecked")
+ private void cacheRootArcs() throws IOException {
+ cachedRootArcs = (FST.Arc<T>[]) new FST.Arc[0x80];
+ final FST.Arc<T> arc = new FST.Arc<T>();
+ getFirstArc(arc);
+ final BytesReader in = getBytesReader(0);
+ if (targetHasArcs(arc)) {
+ readFirstRealArc(arc.target, arc);
+ while(true) {
+ assert arc.label != END_LABEL;
+ if (arc.label < cachedRootArcs.length) {
+ cachedRootArcs[arc.label] = new Arc<T>().copyFrom(arc);
+ } else {
+ break;
+ }
+ if (arc.isLast()) {
+ break;
+ }
+ readNextRealArc(arc, in);
+ }
+ }
}
void setEmptyOutput(T v) throws IOException {
@@ -345,8 +379,9 @@ public class FST {
writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY);
writer.writeVInt(node.numArcs);
// placeholder -- we'll come back and write the number
- // of bytes per arc here:
- writer.writeByte((byte) 0);
+ // of bytes per arc (int) here:
+ // TODO: we could make this a vInt instead
+ writer.writeInt(0);
fixedArrayStart = writer.posWrite;
//System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart);
} else {
@@ -421,15 +456,21 @@ public class FST {
}
}
+ // TODO: if arc'd arrays will be "too wasteful" by some
+ // measure, eg if arcs have vastly different sized
+ // outputs, then we should selectively disable array for
+ // such cases
+
if (doFixedArray) {
assert maxBytesPerArc > 0;
// 2nd pass just "expands" all arcs to take up a fixed
// byte size
final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc;
bytes = ArrayUtil.grow(bytes, sizeNeeded);
- if (maxBytesPerArc > 255) {
- throw new IllegalStateException("max arc size is too large (" + maxBytesPerArc + "); disable array arcs by calling Builder.setAllowArrayArcs(false)");
- }
+ // TODO: we could make this a vInt instead
+ bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24);
+ bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16);
+ bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8);
bytes[fixedArrayStart-1] = (byte) maxBytesPerArc;
// expand the arcs in place, backwards
@@ -502,7 +543,7 @@ public class FST {
if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
// array: jump straight to end
arc.numArcs = in.readVInt();
- arc.bytesPerArc = in.readByte() & 0xFF;
+ arc.bytesPerArc = in.readInt();
//System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
arc.posArcsStart = in.pos;
arc.arcIdx = arc.numArcs - 2;
@@ -528,7 +569,7 @@ public class FST {
}
arc.nextArc = in.pos+1;
}
- readNextRealArc(arc);
+ readNextRealArc(arc, in);
assert arc.isLast();
return arc;
}
@@ -572,7 +613,7 @@ public class FST {
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.numArcs = in.readVInt();
- arc.bytesPerArc = in.readByte() & 0xFF;
+ arc.bytesPerArc = in.readInt();
arc.arcIdx = -1;
arc.nextArc = arc.posArcsStart = in.pos;
//System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
@@ -580,7 +621,7 @@ public class FST {
arc.nextArc = address;
arc.bytesPerArc = 0;
}
- return readNextRealArc(arc);
+ return readNextRealArc(arc, in);
}
/**
@@ -609,7 +650,7 @@ public class FST {
}
return readFirstRealArc(arc.nextArc, arc);
} else {
- return readNextRealArc(arc);
+ return readNextRealArc(arc, getBytesReader(0));
}
}
@@ -627,7 +668,7 @@ public class FST {
//System.out.println(" nextArc fake array");
in.pos--;
in.readVInt();
- in.readByte();
+ in.readInt();
}
} else {
if (arc.bytesPerArc != 0) {
@@ -645,17 +686,16 @@ public class FST {
return readLabel(in);
}
- Arc<T> readNextRealArc(Arc<T> arc) throws IOException {
+ Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
// this is a continuing arc in a fixed array
- final BytesReader in;
if (arc.bytesPerArc != 0) {
// arcs are at fixed entries
arc.arcIdx++;
assert arc.arcIdx < arc.numArcs;
- in = getBytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc);
+ in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc;
} else {
// arcs are packed
- in = getBytesReader(arc.nextArc);
+ in.pos = arc.nextArc;
}
arc.flags = in.readByte();
arc.label = readLabel(in);
@@ -701,7 +741,18 @@ public class FST {
/** Finds an arc leaving the incoming arc, replacing the arc in place.
* This returns null if the arc was not found, else the incoming arc. */
public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc) throws IOException {
-
+ assert cachedRootArcs != null;
+ // Short-circuit if this arc is in the root arc cache:
+ if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) {
+ final Arc<T> result = cachedRootArcs[labelToMatch];
+ if (result == null) {
+ return result;
+ } else {
+ arc.copyFrom(result);
+ return arc;
+ }
+ }
+
if (labelToMatch == END_LABEL) {
if (follow.isFinal()) {
if (follow.target <= 0) {
@@ -726,14 +777,18 @@ public class FST {
// reusable stuff eg BytesReader:
final BytesReader in = getBytesReader(follow.target);
+ // System.out.println("fta label=" + (char) labelToMatch);
+
if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) {
// Arcs are full array; do binary search:
arc.numArcs = in.readVInt();
- arc.bytesPerArc = in.readByte() & 0xFF;
+ //System.out.println(" bs " + arc.numArcs);
+ arc.bytesPerArc = in.readInt();
arc.posArcsStart = in.pos;
int low = 0;
int high = arc.numArcs-1;
while (low <= high) {
+ //System.out.println(" cycle");
int mid = (low + high) >>> 1;
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
int midLabel = readLabel(in);
@@ -744,7 +799,8 @@ public class FST {
high = mid - 1;
else {
arc.arcIdx = mid-1;
- return readNextRealArc(arc);
+ //System.out.println(" found!");
+ return readNextRealArc(arc, in);
}
}
@@ -754,7 +810,12 @@ public class FST {
// Linear scan
readFirstTargetArc(follow, arc);
while(true) {
+ //System.out.println(" non-bs cycle");
+ // TODO: we should fix this code to not have to create
+ // object for the output of every arc we scan... only
+ // for the matching arc, if found
if (arc.label == labelToMatch) {
+ //System.out.println(" found!");
return arc;
} else if (arc.label > labelToMatch) {
return null;
@@ -863,7 +924,7 @@ public class FST {
}
// Non-static: reads byte[] from FST
- class BytesReader extends DataInput {
+ final class BytesReader extends DataInput {
int pos;
public BytesReader(int pos) {
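The arc-array size header changes from one byte to a four-byte, big-endian int in this file's changes: the writer patches bytes[fixedArrayStart-4 .. fixedArrayStart-1] in place, and every reader switches from readByte() & 0xFF to readInt(). A standalone sketch, independent of the FST internals, showing the patched layout decodes back to the same value:

public class ArcSizeHeaderSketch {
  public static void main(String[] args) {
    int maxBytesPerArc = 70000;                   // no longer limited to 255
    byte[] bytes = new byte[8];
    int fixedArrayStart = 4;                      // pretend the 4-byte placeholder ends here

    // Same big-endian patch-in as the second pass in the diff above:
    bytes[fixedArrayStart - 4] = (byte) (maxBytesPerArc >> 24);
    bytes[fixedArrayStart - 3] = (byte) (maxBytesPerArc >> 16);
    bytes[fixedArrayStart - 2] = (byte) (maxBytesPerArc >> 8);
    bytes[fixedArrayStart - 1] = (byte) maxBytesPerArc;

    // Same decoding as DataInput.readInt(), which the readers now use:
    int decoded = ((bytes[0] & 0xFF) << 24) | ((bytes[1] & 0xFF) << 16)
                | ((bytes[2] & 0xFF) << 8)  |  (bytes[3] & 0xFF);
    System.out.println(decoded == maxBytesPerArc); // true
  }
}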
diff --git a/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java b/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java
index 927c1c118ec..a6c4b66c9c1 100644
--- a/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java
+++ b/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java
@@ -170,7 +170,7 @@ abstract class FSTEnum {
if (found) {
// Match
arc.arcIdx = mid-1;
- fst.readNextRealArc(arc);
+ fst.readNextRealArc(arc, in);
assert arc.arcIdx == mid;
assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
output[upto] = fst.outputs.add(output[upto-1], arc.output);
@@ -185,7 +185,7 @@ abstract class FSTEnum {
} else if (low == arc.numArcs) {
// Dead end
arc.arcIdx = arc.numArcs-2;
- fst.readNextRealArc(arc);
+ fst.readNextRealArc(arc, in);
assert arc.isLast();
// Dead end (target is after the last arc);
// rollback to last fork then push
@@ -205,7 +205,7 @@ abstract class FSTEnum {
}
} else {
arc.arcIdx = (low > high ? low : high)-1;
- fst.readNextRealArc(arc);
+ fst.readNextRealArc(arc, in);
assert arc.label > targetLabel;
pushFirst();
return;
@@ -309,7 +309,7 @@ abstract class FSTEnum {
// Match -- recurse
//System.out.println(" match! arcIdx=" + mid);
arc.arcIdx = mid-1;
- fst.readNextRealArc(arc);
+ fst.readNextRealArc(arc, in);
assert arc.arcIdx == mid;
assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
output[upto] = fst.outputs.add(output[upto-1], arc.output);
@@ -352,7 +352,7 @@ abstract class FSTEnum {
// There is a floor arc:
arc.arcIdx = (low > high ? high : low)-1;
//System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1));
- fst.readNextRealArc(arc);
+ fst.readNextRealArc(arc, in);
assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel;
assert arc.label < targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel;
pushLast();
diff --git a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java
index a10376e7472..276aa997214 100644
--- a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java
+++ b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java
@@ -35,6 +35,7 @@ final class NodeHash {
}
private boolean nodesEqual(Builder.UnCompiledNode node, int address) throws IOException {
+ final FST.BytesReader in = fst.getBytesReader(0);
fst.readFirstRealArc(address, scratchArc);
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
return false;
@@ -56,7 +57,7 @@ final class NodeHash {
return false;
}
}
- fst.readNextRealArc(scratchArc);
+ fst.readNextRealArc(scratchArc, in);
}
return false;
@@ -87,6 +88,7 @@ final class NodeHash {
// hash code for a frozen node
private int hash(int node) throws IOException {
final int PRIME = 31;
+ final FST.BytesReader in = fst.getBytesReader(0);
//System.out.println("hash frozen");
int h = 0;
fst.readFirstRealArc(node, scratchArc);
@@ -102,7 +104,7 @@ final class NodeHash {
if (scratchArc.isLast()) {
break;
}
- fst.readNextRealArc(scratchArc);
+ fst.readNextRealArc(scratchArc, in);
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
return h & Integer.MAX_VALUE;
diff --git a/lucene/src/site/build/site/contributions.html b/lucene/src/site/build/site/contributions.html
index 72f4e9ca82c..1e2ecc0a4dd 100644
--- a/lucene/src/site/build/site/contributions.html
+++ b/lucene/src/site/build/site/contributions.html
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
diff --git a/lucene/src/site/build/site/demo.html b/lucene/src/site/build/site/demo.html
index 24251b5f2a7..90373dfe61e 100644
--- a/lucene/src/site/build/site/demo.html
+++ b/lucene/src/site/build/site/demo.html
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
diff --git a/lucene/src/site/build/site/demo2.html b/lucene/src/site/build/site/demo2.html
index b369c658972..0916963ef3c 100644
--- a/lucene/src/site/build/site/demo2.html
+++ b/lucene/src/site/build/site/demo2.html
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
diff --git a/lucene/src/site/build/site/fileformats.html b/lucene/src/site/build/site/fileformats.html
index ef91a18e36f..da02cf70a98 100644
--- a/lucene/src/site/build/site/fileformats.html
+++ b/lucene/src/site/build/site/fileformats.html
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
diff --git a/lucene/src/site/build/site/gettingstarted.html b/lucene/src/site/build/site/gettingstarted.html
index c83de12ffa0..a50a3581ede 100644
--- a/lucene/src/site/build/site/gettingstarted.html
+++ b/lucene/src/site/build/site/gettingstarted.html
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
diff --git a/lucene/src/site/build/site/index.html b/lucene/src/site/build/site/index.html
index 75ca1fcc910..bd258d4fd1d 100644
--- a/lucene/src/site/build/site/index.html
+++ b/lucene/src/site/build/site/index.html
@@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
diff --git a/lucene/src/site/build/site/linkmap.html b/lucene/src/site/build/site/linkmap.html
index cb546159dbf..c4b46090f85 100644
--- a/lucene/src/site/build/site/linkmap.html
+++ b/lucene/src/site/build/site/linkmap.html
@@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
@@ -358,12 +355,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker ___________________ javadoc-contrib-spellchecker
-
-
--
-Wordnet ___________________ javadoc-contrib-wordnet
-
-
-
diff --git a/lucene/src/site/build/site/lucene-contrib/index.html b/lucene/src/site/build/site/lucene-contrib/index.html
index 3d34f87c119..6f511ac85fb 100644
--- a/lucene/src/site/build/site/lucene-contrib/index.html
+++ b/lucene/src/site/build/site/lucene-contrib/index.html
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
@@ -263,9 +260,6 @@ document.write("Last Published: " + document.lastModified);
spellchecker
-
-wordnet
-
--
xml-query-parser
@@ -375,12 +369,7 @@ document.write("Last Published: " + document.lastModified);
Provides tools for spellchecking and suggestions with Lucene.
See spellchecker javadoc
-
-wordnet
-Tools to help utilize wordnet synonyms with Lucene
-See wordnet javadoc
-
-
+
xml-query-parser
A QueryParser that can read queries written in an XML format.
See xml-query-parser javadoc
diff --git a/lucene/src/site/build/site/queryparsersyntax.html b/lucene/src/site/build/site/queryparsersyntax.html
index f2c9d6929e6..ba748aa2262 100644
--- a/lucene/src/site/build/site/queryparsersyntax.html
+++ b/lucene/src/site/build/site/queryparsersyntax.html
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
diff --git a/lucene/src/site/build/site/scoring.html b/lucene/src/site/build/site/scoring.html
index daba6794660..4fe632a7665 100644
--- a/lucene/src/site/build/site/scoring.html
+++ b/lucene/src/site/build/site/scoring.html
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
diff --git a/lucene/src/site/build/site/systemrequirements.html b/lucene/src/site/build/site/systemrequirements.html
index 668f89d3503..94e98b7a3fd 100644
--- a/lucene/src/site/build/site/systemrequirements.html
+++ b/lucene/src/site/build/site/systemrequirements.html
@@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
Spellchecker
-
diff --git a/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml b/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml
index 8d156ccee7e..749364202c1 100644
--- a/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml
+++ b/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml
@@ -106,11 +106,6 @@
See spellchecker javadoc
- wordnet
- Tools to help utilize wordnet synonyms with Lucene
- See wordnet javadoc
-
-
xml-query-parser
A QueryParser that can read queries written in an XML format.
See xml-query-parser javadoc
diff --git a/lucene/src/site/src/documentation/content/xdocs/site.xml b/lucene/src/site/src/documentation/content/xdocs/site.xml
index bf4850eee2d..224e4f347fa 100755
--- a/lucene/src/site/src/documentation/content/xdocs/site.xml
+++ b/lucene/src/site/src/documentation/content/xdocs/site.xml
@@ -66,7 +66,6 @@ See http://forrest.apache.org/docs/linking.html for more info
-
@@ -106,7 +105,6 @@ See http://forrest.apache.org/docs/linking.html for more info
-
diff --git a/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index b5abcc18551..db82596e4ac 100644
--- a/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -260,7 +260,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
default:
text = _TestUtil.randomUnicodeString(random, maxWordLength);
}
-
+
+ if (VERBOSE) {
+ System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+ }
+
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text));
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
@@ -286,6 +290,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
ts.close();
// verify reusing is "reproducable" and also get the normal tokenstream sanity checks
if (!tokens.isEmpty()) {
+ if (VERBOSE) {
+ System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
+ }
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertAnalyzesToReuse(a, text,
diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterCommit.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterCommit.java
index 563d1c13d24..e6244e915a5 100644
--- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterCommit.java
+++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterCommit.java
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
@@ -166,6 +167,13 @@ public class TestIndexWriterCommit extends LuceneTestCase {
* measure max temp disk space used.
*/
public void testCommitOnCloseDiskUsage() throws IOException {
+ // MemoryCodec, since it uses FST, is not necessarily
+ // "additive", ie if you add up N small FSTs, then merge
+ // them, the merged result can easily be larger than the
+ // sum because the merged FST may use array encoding for
+ // some arcs (which uses more space):
+ assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory"));
+ assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory"));
MockDirectoryWrapper dir = newDirectory();
Analyzer analyzer;
if (random.nextBoolean()) {
diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java
index 50febbd5906..5c8f0d58e0a 100644
--- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java
+++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
@@ -142,6 +143,14 @@ public class TestIndexWriterOnDiskFull extends LuceneTestCase {
*/
public void testAddIndexOnDiskFull() throws IOException
{
+ // MemoryCodec, since it uses FST, is not necessarily
+ // "additive", ie if you add up N small FSTs, then merge
+ // them, the merged result can easily be larger than the
+ // sum because the merged FST may use array encoding for
+ // some arcs (which uses more space):
+ assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory"));
+ assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory"));
+
int START_COUNT = 57;
int NUM_DIR = TEST_NIGHTLY ? 50 : 5;
int END_COUNT = START_COUNT + NUM_DIR* (TEST_NIGHTLY ? 25 : 5);
diff --git a/lucene/src/test/org/apache/lucene/util/TestCharsRef.java b/lucene/src/test/org/apache/lucene/util/TestCharsRef.java
new file mode 100644
index 00000000000..1852028378d
--- /dev/null
+++ b/lucene/src/test/org/apache/lucene/util/TestCharsRef.java
@@ -0,0 +1,41 @@
+package org.apache.lucene.util;
+
+import java.util.Arrays;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestCharsRef extends LuceneTestCase {
+ public void testUTF16InUTF8Order() {
+ final int numStrings = atLeast(1000);
+ BytesRef utf8[] = new BytesRef[numStrings];
+ CharsRef utf16[] = new CharsRef[numStrings];
+
+ for (int i = 0; i < numStrings; i++) {
+ String s = _TestUtil.randomUnicodeString(random);
+ utf8[i] = new BytesRef(s);
+ utf16[i] = new CharsRef(s);
+ }
+
+ Arrays.sort(utf8);
+ Arrays.sort(utf16, CharsRef.getUTF16SortedAsUTF8Comparator());
+
+ for (int i = 0; i < numStrings; i++) {
+ assertEquals(utf8[i].utf8ToString(), utf16[i].toString());
+ }
+ }
+}
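
The test above relies on CharsRef.getUTF16SortedAsUTF8Comparator() putting UTF-16 char data into UTF-8 byte order, the order SynonymMap.Builder.build() (later in this patch) sorts its keys into before feeding them to the FST builder. A small JDK-only sketch of why plain String order and UTF-8 byte order disagree once supplementary characters appear; the two characters chosen here are purely illustrative:

    import java.nio.charset.StandardCharsets;

    // Code points above U+FFFF are stored as surrogates (0xD800-0xDFFF) in
    // UTF-16 but as 0xF0.. lead bytes in UTF-8, so the two orders differ.
    public class Utf16VsUtf8OrderSketch {
      public static void main(String[] args) {
        String bmp  = "\uFFFD";                               // U+FFFD, in the BMP
        String supp = new String(Character.toChars(0x10400)); // U+10400, surrogate pair

        // UTF-16 order: lead surrogate 0xD801 < 0xFFFD, so supp sorts first.
        System.out.println("UTF-16: " + Integer.signum(supp.compareTo(bmp)));  // -1

        // UTF-8 byte order: 0xF0 (U+10400) > 0xEF (U+FFFD), so supp sorts last.
        byte[] a = supp.getBytes(StandardCharsets.UTF_8);
        byte[] b = bmp.getBytes(StandardCharsets.UTF_8);
        System.out.println("UTF-8:  " + Integer.signum(Integer.compare(a[0] & 0xFF, b[0] & 0xFF))); // 1
      }
    }
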
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java
new file mode 100644
index 00000000000..7750114e83a
--- /dev/null
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java
@@ -0,0 +1,179 @@
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.Reader;
+import java.text.ParseException;
+import java.util.ArrayList;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.CharsRef;
+
+/**
+ * Parser for the Solr synonyms format.
+ *
+ * - Blank lines and lines starting with '#' are comments.
+ *
+ * - Explicit mappings match any token sequence on the LHS of "=>"
+ *   and replace with all alternatives on the RHS. These types of mappings
+ *   ignore the expand parameter in the constructor.
+ *   Example:
+ *
+ *     i-pod, i pod => ipod
+ *
+ * - Equivalent synonyms may be separated with commas and give
+ *   no explicit mapping. In this case the mapping behavior will
+ *   be taken from the expand parameter in the constructor. This allows
+ *   the same synonym file to be used in different synonym handling strategies.
+ *   Example:
+ *
+ *     ipod, i-pod, i pod
+ *
+ * - Multiple synonym mapping entries are merged.
+ * Example:
+ *
+ * foo => foo bar
+ * foo => baz
+ * is equivalent to
+ * foo => foo bar, baz
+ *
+ *
+ * @lucene.experimental
+ */
+public class SolrSynonymParser extends SynonymMap.Builder {
+ private final boolean expand;
+ private final Analyzer analyzer;
+
+ public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
+ super(dedup);
+ this.expand = expand;
+ this.analyzer = analyzer;
+ }
+
+ public void add(Reader in) throws IOException, ParseException {
+ LineNumberReader br = new LineNumberReader(in);
+ try {
+ addInternal(br);
+ } catch (IllegalArgumentException e) {
+ ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
+ ex.initCause(e);
+ throw ex;
+ } finally {
+ br.close();
+ }
+ }
+
+ private void addInternal(BufferedReader in) throws IOException {
+ String line = null;
+ while ((line = in.readLine()) != null) {
+ if (line.length() == 0 || line.charAt(0) == '#') {
+ continue; // ignore empty lines and comments
+ }
+
+ CharsRef inputs[];
+ CharsRef outputs[];
+
+ // TODO: we could process this more efficiently.
+ String sides[] = split(line, "=>");
+ if (sides.length > 1) { // explicit mapping
+ if (sides.length != 2) {
+ throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
+ }
+ String inputStrings[] = split(sides[0], ",");
+ inputs = new CharsRef[inputStrings.length];
+ for (int i = 0; i < inputs.length; i++) {
+ inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
+ }
+
+ String outputStrings[] = split(sides[1], ",");
+ outputs = new CharsRef[outputStrings.length];
+ for (int i = 0; i < outputs.length; i++) {
+ outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
+ }
+ } else {
+ String inputStrings[] = split(line, ",");
+ inputs = new CharsRef[inputStrings.length];
+ for (int i = 0; i < inputs.length; i++) {
+ inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
+ }
+ if (expand) {
+ outputs = inputs;
+ } else {
+ outputs = new CharsRef[1];
+ outputs[0] = inputs[0];
+ }
+ }
+
+ // currently we include the term itself in the map,
+ // and use includeOrig = false always.
+ // this is how the existing filter does it, but it's actually a bug,
+ // especially if combined with ignoreCase = true
+ for (int i = 0; i < inputs.length; i++) {
+ for (int j = 0; j < outputs.length; j++) {
+ add(inputs[i], outputs[j], false);
+ }
+ }
+ }
+ }
+
+ private static String[] split(String s, String separator) {
+ ArrayList<String> list = new ArrayList<String>(2);
+ StringBuilder sb = new StringBuilder();
+ int pos=0, end=s.length();
+ while (pos < end) {
+ if (s.startsWith(separator,pos)) {
+ if (sb.length() > 0) {
+ list.add(sb.toString());
+ sb=new StringBuilder();
+ }
+ pos+=separator.length();
+ continue;
+ }
+
+ char ch = s.charAt(pos++);
+ if (ch=='\\') {
+ sb.append(ch);
+ if (pos>=end) break; // ERROR, or let it go?
+ ch = s.charAt(pos++);
+ }
+
+ sb.append(ch);
+ }
+
+ if (sb.length() > 0) {
+ list.add(sb.toString());
+ }
+
+ return list.toArray(new String[list.size()]);
+ }
+
+ private String unescape(String s) {
+ if (s.indexOf("\\") >= 0) {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < s.length(); i++) {
+ char ch = s.charAt(i);
+ if (ch == '\\' && i < s.length() - 1) {
+ sb.append(s.charAt(++i));
+ } else {
+ sb.append(ch);
+ }
+ }
+ return sb.toString();
+ }
+ return s;
+ }
+}
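
A minimal usage sketch for the parser added above (not part of the patch): the caller supplies the Analyzer used to tokenize rule entries, and the two rule lines repeat the examples from the class javadoc. The resulting SynonymMap is what the new SynonymFilter consumes:

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.synonym.SolrSynonymParser;
    import org.apache.lucene.analysis.synonym.SynonymFilter;
    import org.apache.lucene.analysis.synonym.SynonymMap;

    public class SolrSynonymParserUsageSketch {

      /** Parses one explicit mapping and one equivalent-synonym line. */
      public static SynonymMap parse(Analyzer ruleAnalyzer) throws Exception {
        String rules =
            "i-pod, i pod => ipod\n" +
            "ipod, i-pod, i pod\n";
        // dedup=true, expand=true; explicit "=>" rules ignore expand.
        SolrSynonymParser parser = new SolrSynonymParser(true, true, ruleAnalyzer);
        parser.add(new StringReader(rules));
        return parser.build();
      }

      /** Wraps any token stream with the new SynonymFilter (ignoreCase=true). */
      public static TokenStream decorate(TokenStream in, SynonymMap map) {
        return new SynonymFilter(in, map, true);
      }
    }
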
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
index 633156e3101..64827c821c4 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
@@ -1,3 +1,5 @@
+package org.apache.lucene.analysis.synonym;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -15,245 +17,550 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.synonym;
+import java.io.IOException;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.fst.FST;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.LinkedList;
+/**
+ * Matches single or multi word synonyms in a token stream.
+ * This token stream cannot properly handle position
+ * increments != 1, ie, you should place this filter before
+ * filtering out stop words.
+ *
+ * Note that with the current implementation, parsing is
+ * greedy, so whenever multiple parses would apply, the rule
+ * starting the earliest and parsing the most tokens wins.
+ * For example if you have these rules:
+ *
+ *
+ * a -> x
+ * a b -> y
+ * b c d -> z
+ *
+ *
+ * Then input a b c d e parses to y b c d,
+ * ie the 2nd rule "wins" because it started
+ * earliest and matched the most input tokens of other rules
+ * starting at that point.
+ *
+ * A future improvement to this filter could allow
+ * non-greedy parsing, such that the 3rd rule would win, and
+ * also separately allow multiple parses, such that all 3
+ * rules would match, perhaps even on a rule by rule
+ * basis.
+ *
+ * NOTE: when a match occurs, the output tokens
+ * associated with the matching rule are "stacked" on top of
+ * the input stream (if the rule had keepOrig=true)
+ * and also on top of another matched rule's output tokens.
+ * This is not a correct solution, as really the output
+ * should be an arbitrary graph/lattice. For example, with
+ * the above match, you would expect an exact PhraseQuery
+ * "y b c" to match the parsed tokens, but it will fail to
+ * do so. This limitation is necessary because Lucene's
+ * TokenStream (and index) cannot yet represent an arbitrary
+ * graph.
+ *
+ * NOTE: If multiple incoming tokens arrive on the
+ * same position, only the first token at that position is
+ * used for parsing. Subsequent tokens simply pass through
+ * and are not parsed. A future improvement would be to
+ * allow these tokens to also be matched.
+ */
+
+// TODO: maybe we should resolve token -> wordID then run
+// FST on wordIDs, for better perf?
+
+// TODO: a more efficient approach would be Aho/Corasick's
+// algorithm
+// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
+// It improves over the current approach here
+// because it does not fully re-start matching at every
+// token. For example, if one pattern is "a b c x"
+// and another is "b c d" and the input is "a b c d", on
+// trying to parse "a b c x" but failing when you got to x,
+// rather than starting over again you really should
+// immediately recognize that "b c d" matches at the next
+// input. I suspect this won't matter that much in
+// practice, but it's possible on some set of synonyms it
+// will. We'd have to modify Aho/Corasick to enforce our
+// conflict resolving (eg greedy matching) because that algo
+// finds all matches.
-/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
- *
- * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
- * or discarded. If the original tokens are included, the position increments may be modified
- * to retain absolute positions after merging with the synonym tokenstream.
- *
- * Generated synonyms will start at the same position as the first matched source token.
- */
public final class SynonymFilter extends TokenFilter {
- private final SynonymMap map; // Map
- private Iterator replacement; // iterator over generated tokens
+ public static final String TYPE_SYNONYM = "SYNONYM";
- public SynonymFilter(TokenStream in, SynonymMap map) {
- super(in);
- if (map == null)
- throw new IllegalArgumentException("map is required");
+ private final SynonymMap synonyms;
- this.map = map;
- // just ensuring these attributes exist...
- addAttribute(CharTermAttribute.class);
- addAttribute(PositionIncrementAttribute.class);
- addAttribute(OffsetAttribute.class);
- addAttribute(TypeAttribute.class);
+ private final boolean ignoreCase;
+ private final int rollBufferSize;
+
+ private int captureCount;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ // How many future input tokens have already been matched
+ // to a synonym; because the matching is "greedy" we don't
+ // try to do any more matching for such tokens:
+ private int inputSkipCount;
+
+ // Hold all buffered (read ahead) stacked input tokens for
+ // a future position. When multiple tokens are at the
+ // same position, we only store (and match against) the
+ // term for the first token at the position, but capture
+ // state for (and enumerate) all other tokens at this
+ // position:
+ private static class PendingInput {
+ final CharsRef term = new CharsRef();
+ AttributeSource.State state;
+ boolean keepOrig;
+ boolean consumed = true;
+ int startOffset;
+ int endOffset;
+
+ public void reset() {
+ state = null;
+ consumed = true;
+ keepOrig = false;
+ }
+ };
+
+ // Rolling buffer, holding pending input tokens we had to
+ // clone because we needed to look ahead, indexed by
+ // position:
+ private final PendingInput[] futureInputs;
+
+ // Holds pending output synonyms for one future position:
+ private static class PendingOutputs {
+ CharsRef[] outputs;
+ int upto;
+ int count;
+ int posIncr = 1;
+
+ public PendingOutputs() {
+ outputs = new CharsRef[1];
+ }
+
+ public void reset() {
+ upto = count = 0;
+ posIncr = 1;
+ }
+
+ public CharsRef pullNext() {
+ assert upto < count;
+ final CharsRef result = outputs[upto++];
+ posIncr = 0;
+ if (upto == count) {
+ reset();
+ }
+ return result;
+ }
+
+ public void add(char[] output, int offset, int len) {
+ if (count == outputs.length) {
+ final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(outputs, 0, next, 0, count);
+ outputs = next;
+ }
+ if (outputs[count] == null) {
+ outputs[count] = new CharsRef();
+ }
+ outputs[count].copy(output, offset, len);
+ count++;
+ }
+ };
+
+ private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
+
+ // Rolling buffer, holding stack of pending synonym
+ // outputs, indexed by position:
+ private final PendingOutputs[] futureOutputs;
+
+ // Where (in rolling buffers) to write next input saved state:
+ private int nextWrite;
+
+ // Where (in rolling buffers) to read next input saved state:
+ private int nextRead;
+
+ // True once we've read last token
+ private boolean finished;
+
+ private final FST.Arc<BytesRef> scratchArc;
+
+ private final FST<BytesRef> fst;
+
+ private final BytesRef scratchBytes = new BytesRef();
+ private final CharsRef scratchChars = new CharsRef();
+
+ /**
+ * @param input input tokenstream
+ * @param synonyms synonym map
+ * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
+ * Note, if you set this to true, it's your responsibility to lowercase
+ * the input entries when you create the {@link SynonymMap}
+ */
+ public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
+ super(input);
+ this.synonyms = synonyms;
+ this.ignoreCase = ignoreCase;
+ this.fst = synonyms.fst;
+
+ if (fst == null) {
+ throw new IllegalArgumentException("fst must be non-null");
+ }
+
+ // Must be 1+ so that when roll buffer is at full
+ // lookahead we can distinguish this full buffer from
+ // the empty buffer:
+ rollBufferSize = 1+synonyms.maxHorizontalContext;
+
+ futureInputs = new PendingInput[rollBufferSize];
+ futureOutputs = new PendingOutputs[rollBufferSize];
+ for(int pos=0;pos bar
- * - need to backtrack - retry matches for tokens already read
- * a b c d => foo
- * b c => bar
- * If the input stream is "a b c x", one will consume "a b c d"
- * trying to match the first rule... all but "a" should be
- * pushed back so a match may be made on "b c".
- * - don't try and match generated tokens (thus need separate queue)
- * matching is not recursive.
- * - handle optional generation of original tokens in all these cases,
- * merging token streams to preserve token positions.
- * - preserve original positionIncrement of first matched token
- */
- @Override
- public boolean incrementToken() throws IOException {
- while (true) {
- // if there are any generated tokens, return them... don't try any
- // matches against them, as we specifically don't want recursion.
- if (replacement!=null && replacement.hasNext()) {
- copy(this, replacement.next());
- return true;
- }
+ This is the core of this TokenFilter: it locates the
+ synonym matches and buffers up the results into
+ futureInputs/Outputs.
- // common case fast-path of first token not matching anything
- AttributeSource firstTok = nextTok();
- if (firstTok == null) return false;
- CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
- SynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
- if (result == null) {
- copy(this, firstTok);
- return true;
- }
+ NOTE: this calls input.incrementToken and does not
+ capture the state if no further tokens were checked. So
+ caller must then forward state to our caller, or capture:
+ */
- // fast-path failed, clone ourselves if needed
- if (firstTok == this)
- firstTok = cloneAttributes();
- // OK, we matched a token, so find the longest match.
+ private void parse() throws IOException {
+ //System.out.println("\nS: parse");
- matched = new LinkedList();
+ assert inputSkipCount == 0;
- result = match(result);
+ int curNextRead = nextRead;
- if (result==null) {
- // no match, simply return the first token read.
- copy(this, firstTok);
- return true;
- }
+ // Holds the longest match we've seen so far:
+ BytesRef matchOutput = null;
+ int matchInputLength = 0;
- // reuse, or create new one each time?
- ArrayList generated = new ArrayList(result.synonyms.length + matched.size() + 1);
+ BytesRef pendingOutput = fst.outputs.getNoOutput();
+ fst.getFirstArc(scratchArc);
- //
- // there was a match... let's generate the new tokens, merging
- // in the matched tokens (position increments need adjusting)
- //
- AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
- boolean includeOrig = result.includeOrig();
+ assert scratchArc.output == fst.outputs.getNoOutput();
- AttributeSource origTok = includeOrig ? firstTok : null;
- PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
- int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
- int repPos=0; // curr position in replacement token stream
- int pos=0; // current position in merged token stream
+ int tokenCount = 0;
- for (int i=0; i foo/0
- // should I re-create the gap on the next buffered token?
-
- replacement = generated.iterator();
- // Now return to the top of the loop to read and return the first
- // generated token.. The reason this is done is that we may have generated
- // nothing at all, and may need to continue with more matching logic.
- }
- }
-
-
- //
- // Defer creation of the buffer until the first time it is used to
- // optimize short fields with no matches.
- //
- private LinkedList buffer;
- private LinkedList matched;
-
- private boolean exhausted;
-
- private AttributeSource nextTok() throws IOException {
- if (buffer!=null && !buffer.isEmpty()) {
- return buffer.removeFirst();
- } else {
- if (!exhausted && input.incrementToken()) {
- return this;
} else {
- exhausted = true;
- return null;
+ // Still in our lookahead
+ buffer = futureInputs[curNextRead].term.chars;
+ bufferLen = futureInputs[curNextRead].term.length;
+ //System.out.println(" old token=" + new String(buffer, 0, bufferLen));
+ }
+
+ tokenCount++;
+
+ // Run each char in this token through the FST:
+ int bufUpto = 0;
+ while(bufUpto < bufferLen) {
+ final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
+ if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
+ //System.out.println(" stop");
+ break byToken;
+ }
+
+ // Accum the output
+ pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+ //System.out.println(" char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
+ bufUpto += Character.charCount(codePoint);
+ }
+
+ // OK, entire token matched; now see if this is a final
+ // state:
+ if (scratchArc.isFinal()) {
+ matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
+ matchInputLength = tokenCount;
+ //System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput);
+ }
+
+ // See if the FST wants to continue matching (ie, needs to
+ // see the next input token):
+ if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
+ // No further rules can match here; we're done
+ // searching for matching rules starting at the
+ // current input position.
+ break;
+ } else {
+ // More matching is possible -- accum the output (if
+ // any) of the WORD_SEP arc:
+ pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+ if (nextRead == nextWrite) {
+ capture();
+ }
+ }
+
+ curNextRead = rollIncr(curNextRead);
+ }
+
+ if (nextRead == nextWrite && !finished) {
+ //System.out.println(" skip write slot=" + nextWrite);
+ nextWrite = rollIncr(nextWrite);
+ }
+
+ if (matchOutput != null) {
+ //System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
+ inputSkipCount = matchInputLength;
+ addOutput(matchOutput);
+ } else if (nextRead != nextWrite) {
+ // Even though we had no match here, we set to 1
+ // because we need to skip current input token before
+ // trying to match again:
+ inputSkipCount = 1;
+ } else {
+ assert finished;
+ }
+
+ //System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
+ }
+
+ // Interleaves all output tokens onto the futureOutputs:
+ private void addOutput(BytesRef bytes) {
+ bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
+
+ final int code = bytesReader.readVInt();
+ final boolean keepOrig = (code & 0x1) == 0;
+ final int count = code >>> 1;
+ //System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig);
+ for(int outputIDX=0;outputIDX ords = new ArrayList();
+ }
+
+ /** Sugar: just joins the provided terms with {@link
+ * SynonymMap#WORD_SEPARATOR}. reuse and its chars
+ * must not be null. */
+ public static CharsRef join(String[] words, CharsRef reuse) {
+ int upto = 0;
+ char[] buffer = reuse.chars;
+ for(String word : words) {
+ if (upto > 0) {
+ if (upto >= buffer.length) {
+ reuse.grow(upto);
+ buffer = reuse.chars;
+ }
+ buffer[upto++] = SynonymMap.WORD_SEPARATOR;
+ }
+
+ final int wordLen = word.length();
+ final int needed = upto + wordLen;
+ if (needed > buffer.length) {
+ reuse.grow(needed);
+ buffer = reuse.chars;
+ }
+
+ word.getChars(0, wordLen, buffer, upto);
+ upto += wordLen;
+ }
+
+ return reuse;
+ }
+
+ /** Sugar: analyzes the text with the analyzer and
+ * separates by {@link SynonymMap#WORD_SEPARATOR}.
+ * reuse and its chars must not be null. */
+ public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
+ TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text));
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+ ts.reset();
+ reuse.length = 0;
+ while (ts.incrementToken()) {
+ int length = termAtt.length();
+ if (length == 0) {
+ throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
+ }
+ if (posIncAtt.getPositionIncrement() != 1) {
+ throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
+ }
+ reuse.grow(reuse.length + length + 1); /* current + word + separator */
+ int end = reuse.offset + reuse.length;
+ if (reuse.length > 0) {
+ reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
+ reuse.length++;
+ }
+ System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
+ reuse.length += length;
+ }
+ ts.end();
+ ts.close();
+ if (reuse.length == 0) {
+ throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
+ }
+ return reuse;
+ }
+
+ /** only used for asserting! */
+ private boolean hasHoles(CharsRef chars) {
+ final int end = chars.offset + chars.length;
+ for(int idx=chars.offset+1;idx 0 (got " + numInputWords + ")");
+ }
+ if (input.length <= 0) {
+ throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
+ }
+ if (numOutputWords <= 0) {
+ throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
+ }
+ if (output.length <= 0) {
+ throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
+ }
+
+ assert !hasHoles(input): "input has holes: " + input;
+ assert !hasHoles(output): "output has holes: " + output;
+
+ //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
+ final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch);
+ // lookup in hash
+ int ord = words.add(utf8Scratch, hashCode);
+ if (ord < 0) {
+ // already exists in our hash
+ ord = (-ord)-1;
+ //System.out.println(" output=" + output + " old ord=" + ord);
+ } else {
+ //System.out.println(" output=" + output + " new ord=" + ord);
+ }
+
+ MapEntry e = workingSet.get(input);
+ if (e == null) {
+ e = new MapEntry();
+ workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map
+ }
+
+ e.ords.add(ord);
+ e.includeOrig |= includeOrig;
+ maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
+ maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
+ }
+
+ private int countWords(CharsRef chars) {
+ int wordCount = 1;
+ int upto = chars.offset;
+ final int limit = chars.offset + chars.length;
+ while(upto < limit) {
+ if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) {
+ wordCount++;
+ }
+ }
+ return wordCount;
+ }
+
+ /**
+ * Add a phrase->phrase synonym mapping.
+ * Phrases are character sequences where words are
+ * separated with character zero (\u0000). Empty words
+ * (two \u0000s in a row) are not allowed in the input nor
+ * the output!
+ *
+ * @param input input phrase
+ * @param output output phrase
+ * @param includeOrig true if the original should be included
+ */
+ public void add(CharsRef input, CharsRef output, boolean includeOrig) {
+ add(input, countWords(input), output, countWords(output), includeOrig);
+ }
+
+ /**
+ * Builds an {@link SynonymMap} and returns it.
+ */
+ public SynonymMap build() throws IOException {
+ ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ // TODO: are we using the best sharing options?
+ org.apache.lucene.util.fst.Builder<BytesRef> builder =
+ new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
+
+ BytesRef scratch = new BytesRef(64);
+ ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
+
+ final Set<Integer> dedupSet;
+
+ if (dedup) {
+ dedupSet = new HashSet<Integer>();
+ } else {
+ dedupSet = null;
+ }
+
+ final byte[] spare = new byte[5];
+
+ Set<CharsRef> keys = workingSet.keySet();
+ CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
+ Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
+
+ //System.out.println("fmap.build");
+ for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
+ CharsRef input = sortedKeys[keyIdx];
+ MapEntry output = workingSet.get(input);
+
+ int numEntries = output.ords.size();
+ // output size, assume the worst case
+ int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
+
+ scratch.grow(estimatedSize);
+ scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
+ assert scratch.offset == 0;
+
+ // now write our output data:
+ int count = 0;
+ for (int i = 0; i < numEntries; i++) {
+ if (dedupSet != null) {
+ // box once
+ final Integer ent = output.ords.get(i);
+ if (dedupSet.contains(ent)) {
+ continue;
+ }
+ dedupSet.add(ent);
+ }
+ scratchOutput.writeVInt(output.ords.get(i));
+ count++;
+ }
+
+ final int pos = scratchOutput.getPosition();
+ scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
+ final int pos2 = scratchOutput.getPosition();
+ final int vIntLen = pos2-pos;
+
+ // Move the count + includeOrig to the front of the byte[]:
+ System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
+ System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
+ System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);
+
+ if (dedupSet != null) {
+ dedupSet.clear();
+ }
+
+ scratch.length = scratchOutput.getPosition() - scratch.offset;
+ //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
+ builder.add(input, new BytesRef(scratch));
+ }
+
+ FST<BytesRef> fst = builder.finish();
+ return new SynonymMap(fst, words, maxHorizontalContext);
}
- List superset = currMap.synonyms==null ? replacement :
- mergeTokens(Arrays.asList(currMap.synonyms), replacement);
- currMap.synonyms = superset.toArray(new Token[superset.size()]);
- if (includeOrig) currMap.flags |= INCLUDE_ORIG;
}
-
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder("<");
- if (synonyms!=null) {
- sb.append("[");
- for (int i=0; i");
- return sb.toString();
- }
-
-
-
- /** Produces a List from a List */
- public static List makeTokens(List strings) {
- List ret = new ArrayList(strings.size());
- for (String str : strings) {
- //Token newTok = new Token(str,0,0,"SYNONYM");
- Token newTok = new Token(str, 0,0,"SYNONYM");
- ret.add(newTok);
- }
- return ret;
- }
-
-
- /**
- * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
- * the tokens end up at the same position.
- *
- * Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
- * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
- *
- */
- public static List mergeTokens(List lst1, List lst2) {
- ArrayList result = new ArrayList();
- if (lst1 ==null || lst2 ==null) {
- if (lst2 != null) result.addAll(lst2);
- if (lst1 != null) result.addAll(lst1);
- return result;
- }
-
- int pos=0;
- Iterator iter1=lst1.iterator();
- Iterator iter2=lst2.iterator();
- Token tok1 = iter1.hasNext() ? iter1.next() : null;
- Token tok2 = iter2.hasNext() ? iter2.next() : null;
- int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0;
- int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0;
- while(tok1!=null || tok2!=null) {
- while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
- Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
- tok.copyBuffer(tok1.buffer(), 0, tok1.length());
- tok.setPositionIncrement(pos1-pos);
- result.add(tok);
- pos=pos1;
- tok1 = iter1.hasNext() ? iter1.next() : null;
- pos1 += tok1!=null ? tok1.getPositionIncrement() : 0;
- }
- while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
- Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
- tok.copyBuffer(tok2.buffer(), 0, tok2.length());
- tok.setPositionIncrement(pos2-pos);
- result.add(tok);
- pos=pos2;
- tok2 = iter2.hasNext() ? iter2.next() : null;
- pos2 += tok2!=null ? tok2.getPositionIncrement() : 0;
- }
- }
- return result;
- }
-
}
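
For reference, a minimal sketch (not part of the patch) of building a SynonymMap directly with SynonymMap.Builder and wrapping a stream in the new SynonymFilter. Multi-word phrases are joined with the \u0000 word separator, as the add() javadoc above requires and as the TestSynonymMapFilter helper below does with replaceAll:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.synonym.SynonymFilter;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.util.CharsRef;

    public class SynonymMapBuilderSketch {

      // "a b" -> "a\u0000b": words separated by the \u0000 word separator.
      static CharsRef phrase(String s) {
        return new CharsRef(s.replace(' ', (char) 0));
      }

      public static SynonymMap buildMap() throws IOException {
        SynonymMap.Builder b = new SynonymMap.Builder(true);  // dedup=true
        b.add(phrase("a b"), phrase("bar fee"), true);        // keepOrig=true
        b.add(phrase("e f"), phrase("foo bar"), false);       // replacement only
        return b.build();
      }

      public static TokenStream wrap(TokenStream in) throws IOException {
        // ignoreCase=false: the entries above are already lowercase.
        return new SynonymFilter(in, buildMap(), false);
      }
    }
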
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java
new file mode 100644
index 00000000000..20aeea0e362
--- /dev/null
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java
@@ -0,0 +1,112 @@
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.Reader;
+import java.text.ParseException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.CharsRef;
+
+/**
+ * Parser for wordnet prolog format
+ *
+ * See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
+ * @lucene.experimental
+ */
+// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
+public class WordnetSynonymParser extends SynonymMap.Builder {
+ private final boolean expand;
+ private final Analyzer analyzer;
+
+ public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
+ super(dedup);
+ this.expand = expand;
+ this.analyzer = analyzer;
+ }
+
+ public void add(Reader in) throws IOException, ParseException {
+ LineNumberReader br = new LineNumberReader(in);
+ try {
+ String line = null;
+ String lastSynSetID = "";
+ CharsRef synset[] = new CharsRef[8];
+ int synsetSize = 0;
+
+ while ((line = br.readLine()) != null) {
+ String synSetID = line.substring(2, 11);
+
+ if (!synSetID.equals(lastSynSetID)) {
+ addInternal(synset, synsetSize);
+ synsetSize = 0;
+ }
+
+ if (synset.length <= synsetSize+1) {
+ CharsRef larger[] = new CharsRef[synset.length * 2];
+ System.arraycopy(synset, 0, larger, 0, synsetSize);
+ synset = larger;
+ }
+
+ synset[synsetSize] = parseSynonym(line, synset[synsetSize]);
+ synsetSize++;
+ lastSynSetID = synSetID;
+ }
+
+ // final synset in the file
+ addInternal(synset, synsetSize);
+ } catch (IllegalArgumentException e) {
+ ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
+ ex.initCause(e);
+ throw ex;
+ } finally {
+ br.close();
+ }
+ }
+
+ private CharsRef parseSynonym(String line, CharsRef reuse) throws IOException {
+ if (reuse == null) {
+ reuse = new CharsRef(8);
+ }
+
+ int start = line.indexOf('\'')+1;
+ int end = line.lastIndexOf('\'');
+
+ String text = line.substring(start, end).replace("''", "'");
+ return analyze(analyzer, text, reuse);
+ }
+
+ private void addInternal(CharsRef synset[], int size) throws IOException {
+ if (size <= 1) {
+ return; // nothing to do
+ }
+
+ if (expand) {
+ for (int i = 0; i < size; i++) {
+ for (int j = 0; j < size; j++) {
+ add(synset[i], synset[j], false);
+ }
+ }
+ } else {
+ for (int i = 0; i < size; i++) {
+ add(synset[i], synset[0], false);
+ }
+ }
+ }
+}
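
A minimal usage sketch for the parser added above (not part of the patch). The two prolog lines are made up but follow the wn_s.pl shape the code expects (a nine-character synset id starting at offset 2, the word between single quotes); the caller supplies the Analyzer:

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.analysis.synonym.WordnetSynonymParser;

    public class WordnetSynonymParserUsageSketch {

      public static SynonymMap parse(Analyzer analyzer) throws Exception {
        // Hypothetical wn_s.pl lines: same synset id => same synonym group.
        String prolog =
            "s(100000001,1,'automobile',n,1,0).\n" +
            "s(100000001,2,'car',n,1,0).\n";
        // dedup=true, expand=true (all pairs within a synset map to each other).
        WordnetSynonymParser parser = new WordnetSynonymParser(true, true, analyzer);
        parser.add(new StringReader(prolog));
        return parser.build();
      }
    }
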
diff --git a/lucene/contrib/wordnet/src/java/overview.html b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/package.html
similarity index 84%
rename from lucene/contrib/wordnet/src/java/overview.html
rename to modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/package.html
index cd05399880b..2fd37e8de20 100644
--- a/lucene/contrib/wordnet/src/java/overview.html
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/package.html
@@ -1,3 +1,4 @@
+
-
-
-
- wordnet
-
-
-
- wordnet
-
-
\ No newline at end of file
+
+
+Analysis components for Synonyms.
+
+
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java
new file mode 100644
index 00000000000..6260a3d1618
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java
@@ -0,0 +1,144 @@
+package org.apache.lucene.analysis.synonym;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.ParseException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.junit.Test;
+
+/**
+ * Tests parser for the Solr synonyms format
+ * @lucene.experimental
+ */
+public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
+
+ /** Tests some simple examples from the solr wiki */
+ public void testSimple() throws Exception {
+ String testFile =
+ "i-pod, ipod, ipoooood\n" +
+ "foo => foo bar\n" +
+ "foo => baz\n" +
+ "this test, that testing";
+
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
+ parser.add(new StringReader(testFile));
+ final SynonymMap map = parser.build();
+
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "ball",
+ new String[] { "ball" },
+ new int[] { 1 });
+
+ assertAnalyzesTo(analyzer, "i-pod",
+ new String[] { "i-pod", "ipod", "ipoooood" },
+ new int[] { 1, 0, 0 });
+
+ assertAnalyzesTo(analyzer, "foo",
+ new String[] { "foo", "baz", "bar" },
+ new int[] { 1, 0, 1 });
+
+ assertAnalyzesTo(analyzer, "this test",
+ new String[] { "this", "that", "test", "testing" },
+ new int[] { 1, 0, 1, 0 });
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidDoubleMap() throws Exception {
+ String testFile = "a => b => c";
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
+ parser.add(new StringReader(testFile));
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidAnalyzesToNothingOutput() throws Exception {
+ String testFile = "a => 1";
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
+ parser.add(new StringReader(testFile));
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidAnalyzesToNothingInput() throws Exception {
+ String testFile = "1 => a";
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
+ parser.add(new StringReader(testFile));
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidPositionsInput() throws Exception {
+ String testFile = "testola => the test";
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
+ parser.add(new StringReader(testFile));
+ }
+
+ /** parse a syn file with bad syntax */
+ @Test(expected=ParseException.class)
+ public void testInvalidPositionsOutput() throws Exception {
+ String testFile = "the test => testola";
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
+ parser.add(new StringReader(testFile));
+ }
+
+ /** parse a syn file with some escaped syntax chars */
+ public void testEscapedStuff() throws Exception {
+ String testFile =
+ "a\\=>a => b\\=>b\n" +
+ "a\\,a => b\\,b";
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
+ parser.add(new StringReader(testFile));
+ final SynonymMap map = parser.build();
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "ball",
+ new String[] { "ball" },
+ new int[] { 1 });
+
+ assertAnalyzesTo(analyzer, "a=>a",
+ new String[] { "b=>b" },
+ new int[] { 1 });
+
+ assertAnalyzesTo(analyzer, "a,a",
+ new String[] { "b,b" },
+ new int[] { 1 });
+ }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
new file mode 100644
index 00000000000..ba1b23f5c6b
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
@@ -0,0 +1,393 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.synonym;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util._TestUtil;
+
+public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
+
+ private SynonymMap.Builder b;
+ private Tokenizer tokensIn;
+ private SynonymFilter tokensOut;
+ private CharTermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private OffsetAttribute offsetAtt;
+
+ private void add(String input, String output, boolean keepOrig) {
+ b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
+ new CharsRef(output.replaceAll(" +", "\u0000")),
+ keepOrig);
+ }
+
+ private void assertEquals(CharTermAttribute term, String expected) {
+ assertEquals(expected.length(), term.length());
+ final char[] buffer = term.buffer();
+ for(int chIDX=0;chIDX 0) {
+ assertTrue(tokensOut.incrementToken());
+ if (VERBOSE) {
+ System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+ }
+ }
+ assertEquals(termAtt, expectedAtPos[atPos]);
+ assertEquals(atPos == 0 ? 1 : 0,
+ posIncrAtt.getPositionIncrement());
+ // start/end offset of all tokens at same pos should
+ // be the same:
+ assertEquals(startOffset, offsetAtt.startOffset());
+ assertEquals(endOffset, offsetAtt.endOffset());
+ }
+ }
+ tokensOut.end();
+ tokensOut.close();
+ if (VERBOSE) {
+ System.out.println(" incr: END");
+ }
+ assertEquals(expectedUpto, expected.length);
+ }
+
+ public void testBasic() throws Exception {
+ b = new SynonymMap.Builder(true);
+ add("a", "foo", true);
+ add("a b", "bar fee", true);
+ add("b c", "dog collar", true);
+ add("c d", "dog harness holder extras", true);
+ add("m c e", "dog barks loudly", false);
+
+ add("e f", "foo bar", false);
+ add("e f", "baz bee", false);
+
+ add("z", "boo", false);
+ add("y", "bee", true);
+
+ tokensIn = new MockTokenizer(new StringReader("a"),
+ MockTokenizer.WHITESPACE,
+ true);
+ tokensIn.reset();
+ assertTrue(tokensIn.incrementToken());
+ assertFalse(tokensIn.incrementToken());
+ tokensIn.end();
+ tokensIn.close();
+
+ tokensOut = new SynonymFilter(tokensIn,
+ b.build(),
+ true);
+ termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+ posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
+
+ verify("a b c", "a/bar b/fee c");
+
+ // syn output extends beyond input tokens
+ verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");
+
+ verify("a b a", "a/bar b/fee a/foo");
+
+ // outputs that add to one another:
+ verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");
+
+ // two outputs for same input
+ verify("e f", "foo/baz bar/bee");
+
+ // mixed keepOrig true/false:
+ verify("a m c e x", "a/foo dog barks loudly x");
+ verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
+ assertTrue(tokensOut.getCaptureCount() > 0);
+
+ // no captureStates when no syns matched
+ verify("p q r s t", "p q r s t");
+ assertEquals(0, tokensOut.getCaptureCount());
+
+ // no captureStates when only single-input syns, w/ no
+ // lookahead needed, matched
+ verify("p q z y t", "p q boo y/bee t");
+ assertEquals(0, tokensOut.getCaptureCount());
+ }
+
+ private String getRandomString(char start, int alphabetSize, int length) {
+ assert alphabetSize <= 26;
+ char[] s = new char[2*length];
+ for(int charIDX=0;charIDX out;
+ boolean keepOrig;
+ }
+
+ public String slowSynMatcher(String doc, List syns, int maxOutputLength) {
+ assertTrue(doc.length() % 2 == 0);
+ final int numInputs = doc.length()/2;
+ boolean[] keepOrigs = new boolean[numInputs];
+ Arrays.fill(keepOrigs, false);
+ String[] outputs = new String[numInputs + maxOutputLength];
+ OneSyn[] matches = new OneSyn[numInputs];
+ for(OneSyn syn : syns) {
+ int idx = -1;
+ while(true) {
+ idx = doc.indexOf(syn.in, 1+idx);
+ if (idx == -1) {
+ break;
+ }
+ assertTrue(idx % 2 == 0);
+ final int matchIDX = idx/2;
+ assertTrue(syn.in.length() % 2 == 1);
+ if (matches[matchIDX] == null) {
+ matches[matchIDX] = syn;
+ } else if (syn.in.length() > matches[matchIDX].in.length()) {
+ // Greedy conflict resolution: longer match wins:
+ matches[matchIDX] = syn;
+ } else {
+ assertTrue(syn.in.length() < matches[matchIDX].in.length());
+ }
+ }
+ }
+
+ // Greedy conflict resolution: if syn matches a range of inputs,
+ // it prevents other syns from matching that range
+ for(int inputIDX=0;inputIDX= numInputs && outputs[inputIDX] == null) {
+ break;
+ }
+ if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
+ sb.append(inputTokens[inputIDX]);
+ posHasOutput = true;
+ }
+
+ if (outputs[inputIDX] != null) {
+ if (posHasOutput) {
+ sb.append('/');
+ }
+ sb.append(outputs[inputIDX]);
+ }
+ if (inputIDX < limit-1) {
+ sb.append(' ');
+ }
+ }
+
+ return sb.toString();
+ }
+
+ public void testRandom() throws Exception {
+
+ final int alphabetSize = _TestUtil.nextInt(random, 2, 7);
+
+ final int docLen = atLeast(3000);
+ //final int docLen = 50;
+
+ final String document = getRandomString('a', alphabetSize, docLen);
+
+ if (VERBOSE) {
+ System.out.println("TEST: doc=" + document);
+ }
+
+ final int numSyn = atLeast(5);
+ //final int numSyn = 2;
+
+ final Map<String,OneSyn> synMap = new HashMap<String,OneSyn>();
+ final List<OneSyn> syns = new ArrayList<OneSyn>();
+ final boolean dedup = random.nextBoolean();
+ if (VERBOSE) {
+ System.out.println(" dedup=" + dedup);
+ }
+ b = new SynonymMap.Builder(dedup);
+ for(int synIDX=0;synIDX<numSyn;synIDX++) {
+ final String synIn = getRandomString('a', alphabetSize, _TestUtil.nextInt(random, 1, 5)).trim();
+ OneSyn s = synMap.get(synIn);
+ if (s == null) {
+ s = new OneSyn();
+ s.in = synIn;
+ syns.add(s);
+ s.out = new ArrayList<String>();
+ synMap.put(synIn, s);
+ s.keepOrig = random.nextBoolean();
+ }
+ final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim();
+ s.out.add(synOut);
+ add(synIn, synOut, s.keepOrig);
+ if (VERBOSE) {
+ System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
+ }
+ }
+
+ tokensIn = new MockTokenizer(new StringReader("a"),
+ MockTokenizer.WHITESPACE,
+ true);
+ tokensIn.reset();
+ assertTrue(tokensIn.incrementToken());
+ assertFalse(tokensIn.incrementToken());
+ tokensIn.end();
+ tokensIn.close();
+
+ tokensOut = new SynonymFilter(tokensIn,
+ b.build(),
+ true);
+ termAtt = tokensOut.addAttribute(CharTermAttribute.class);
+ posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
+
+ if (dedup) {
+ pruneDups(syns);
+ }
+
+ final String expected = slowSynMatcher(document, syns, 5);
+
+ if (VERBOSE) {
+ System.out.println("TEST: expected=" + expected);
+ }
+
+ verify(document, expected);
+ }
+
+ private void pruneDups(List<OneSyn> syns) {
+ Set<String> seen = new HashSet<String>();
+ for(OneSyn syn : syns) {
+ int idx = 0;
+ while(idx < syn.out.size()) {
+ String out = syn.out.get(idx);
+ if (!seen.contains(out)) {
+ seen.add(out);
+ idx++;
+ } else {
+ syn.out.remove(idx);
+ }
+ }
+ seen.clear();
+ }
+ }
+
+ private String randomNonEmptyString() {
+ while(true) {
+ final String s = _TestUtil.randomUnicodeString(random).trim();
+ if (s.length() != 0 && s.indexOf('\u0000') == -1) {
+ return s;
+ }
+ }
+ }
+
+ /** simple random test, doesn't verify correctness.
+ * does verify it doesn't throw exceptions, and that the stream doesn't misbehave
+ */
+ public void testRandom2() throws Exception {
+ final int numIters = atLeast(10);
+ for (int i = 0; i < numIters; i++) {
+ b = new SynonymMap.Builder(random.nextBoolean());
+ final int numEntries = atLeast(10);
+ for (int j = 0; j < numEntries; j++) {
+ add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
+ }
+ final SynonymMap map = b.build();
+ final boolean ignoreCase = random.nextBoolean();
+
+ final Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
+ }
+ };
+
+ checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
+ }
+ }
+}
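
For illustration only (not part of the patch): in the verify() expectations above, a '/' joins tokens that occupy the same position. Reusing this test's own add()/verify() helpers, the simplest keepOrig case would read roughly as follows; the single-rule setup and the elided tokensIn/tokensOut wiring are assumptions, mirroring testBasic().

    // illustration only, reusing the test's helpers; not from the patch
    b = new SynonymMap.Builder(true);
    add("a", "foo", true);   // keepOrig=true: the original token survives
    // ... rebuild tokensIn/tokensOut exactly as testBasic() does ...
    verify("a", "a/foo");    // one position holding "a" (posInc 1) and "foo" (posInc 0)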
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java
new file mode 100644
index 00000000000..6f1c6329afb
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.synonym;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
+ Analyzer analyzer;
+
+ String synonymsFile =
+ "s(100000001,1,'woods',n,1,0).\n" +
+ "s(100000001,2,'wood',n,1,0).\n" +
+ "s(100000001,3,'forest',n,1,0).\n" +
+ "s(100000002,1,'wolfish',n,1,0).\n" +
+ "s(100000002,2,'ravenous',n,1,0).\n" +
+ "s(100000003,1,'king',n,1,1).\n" +
+ "s(100000003,2,'baron',n,1,1).\n" +
+ "s(100000004,1,'king''s evil',n,1,1).\n" +
+ "s(100000004,2,'king''s meany',n,1,1).\n";
+
+ public void testSynonyms() throws Exception {
+ WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random));
+ parser.add(new StringReader(synonymsFile));
+ final SynonymMap map = parser.build();
+
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
+ }
+ };
+
+ /* all expansions */
+ assertAnalyzesTo(analyzer, "Lost in the woods",
+ new String[] { "Lost", "in", "the", "woods", "wood", "forest" },
+ new int[] { 0, 5, 8, 12, 12, 12 },
+ new int[] { 4, 7, 11, 17, 17, 17 },
+ new int[] { 1, 1, 1, 1, 0, 0 });
+
+ /* single quote */
+ assertAnalyzesTo(analyzer, "king",
+ new String[] { "king", "baron" });
+
+ /* multi words */
+ assertAnalyzesTo(analyzer, "king's evil",
+ new String[] { "king's", "king's", "evil", "meany" });
+ }
+}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java
index 48b5d251d85..317090863eb 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java
@@ -90,6 +90,10 @@ import org.apache.lucene.store.OutputStreamDataOutput;
*
* "alphabetically" in any of the documentation above indicates utf16 codepoint order,
* nothing else.
+ *
+ * NOTE: the FST file format is experimental and
+ * subject to change suddenly, requiring you to rebuild the
+ * FST suggest index.
*/
public class FSTLookup extends Lookup {
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index e0f2c21ea5a..653bcfa58e6 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -320,6 +320,9 @@ New Features
Optimizations
----------------------
+* LUCENE-3233: Improved memory usage, build time, and performance of
+ SynonymFilterFactory. (Mike McCandless, Robert Muir)
+
Bug Fixes
----------------------
diff --git a/solr/core/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java
new file mode 100644
index 00000000000..151f5a9b623
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java
@@ -0,0 +1,157 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.text.ParseException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
+import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.Version;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * @deprecated (3.4) use {@link SynonymFilterFactory} instead. this is only a backwards compatibility
+ * mechanism that will be removed in Lucene 5.0
+ */
+// NOTE: rename this to "SynonymFilterFactory" and nuke that delegator in Lucene 5.0!
+@Deprecated
+final class FSTSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ private SynonymMap map;
+ private boolean ignoreCase;
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ return new SynonymFilter(input, map, ignoreCase);
+ }
+
+ @Override
+ public void inform(ResourceLoader loader) {
+ final boolean ignoreCase = getBoolean("ignoreCase", false);
+ this.ignoreCase = ignoreCase;
+
+ String tf = args.get("tokenizerFactory");
+
+ final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args);
+
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
+ TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+
+ String format = args.get("format");
+ try {
+ if (format == null || format.equals("solr")) {
+ // TODO: expose dedup as a parameter?
+ map = loadSolrSynonyms(loader, true, analyzer);
+ } else if (format.equals("wordnet")) {
+ map = loadWordnetSynonyms(loader, true, analyzer);
+ } else {
+ // TODO: somehow make this more pluggable
+ throw new RuntimeException("Unrecognized synonyms format: " + format);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Load synonyms from the solr format, "format=solr".
+ */
+ private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
+ final boolean expand = getBoolean("expand", true);
+ String synonyms = args.get("synonyms");
+ if (synonyms == null)
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
+
+ CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+
+ SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
+ File synonymFile = new File(synonyms);
+ if (synonymFile.exists()) {
+ decoder.reset();
+ parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
+ } else {
+ List<String> files = StrUtils.splitFileNames(synonyms);
+ for (String file : files) {
+ decoder.reset();
+ parser.add(new InputStreamReader(loader.openResource(file), decoder));
+ }
+ }
+ return parser.build();
+ }
+
+ /**
+ * Load synonyms from the wordnet format, "format=wordnet".
+ */
+ private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
+ final boolean expand = getBoolean("expand", true);
+ String synonyms = args.get("synonyms");
+ if (synonyms == null)
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
+
+ CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+
+ WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer);
+ File synonymFile = new File(synonyms);
+ if (synonymFile.exists()) {
+ decoder.reset();
+ parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
+ } else {
+ List<String> files = StrUtils.splitFileNames(synonyms);
+ for (String file : files) {
+ decoder.reset();
+ parser.add(new InputStreamReader(loader.openResource(file), decoder));
+ }
+ }
+ return parser.build();
+ }
+
+ private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
+ TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname);
+ tokFactory.init(args);
+ return tokFactory;
+ }
+}
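
For illustration only (not part of the patch): this factory is normally reached through solr.SynonymFilterFactory (changed below), which delegates here for luceneMatchVersion >= 3.4. A wordnet-format configuration could be exercised roughly as follows, modeled on TestSynonymFilterFactory later in this patch; the resource name wn_s.pl and the bare SolrResourceLoader are illustrative assumptions.

    // illustration only, mirroring TestSynonymFilterFactory below; not from the patch
    SynonymFilterFactory factory = new SynonymFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", Version.LUCENE_34.toString()); // >= 3.4 selects the FST-backed impl
    args.put("synonyms", "wn_s.pl");                              // assumed wordnet resource name
    args.put("format", "wordnet");                                // handled by FSTSynonymFilterFactory above
    factory.init(args);
    factory.inform(new SolrResourceLoader(null, null));
    TokenStream ts = factory.create(new MockTokenizer(new StringReader("woods"), MockTokenizer.WHITESPACE, false));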
diff --git a/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilter.java b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilter.java
new file mode 100644
index 00000000000..d97cacda7b6
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilter.java
@@ -0,0 +1,261 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedList;
+
+/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
+ *
+ * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
+ * or discarded. If the original tokens are included, the position increments may be modified
+ * to retain absolute positions after merging with the synonym tokenstream.
+ *
+ * Generated synonyms will start at the same position as the first matched source token.
+ * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
+ */
+@Deprecated
+final class SlowSynonymFilter extends TokenFilter {
+
+ private final SlowSynonymMap map; // Map
+ private Iterator<AttributeSource> replacement; // iterator over generated tokens
+
+ public SlowSynonymFilter(TokenStream in, SlowSynonymMap map) {
+ super(in);
+ if (map == null)
+ throw new IllegalArgumentException("map is required");
+
+ this.map = map;
+ // just ensuring these attributes exist...
+ addAttribute(CharTermAttribute.class);
+ addAttribute(PositionIncrementAttribute.class);
+ addAttribute(OffsetAttribute.class);
+ addAttribute(TypeAttribute.class);
+ }
+
+
+ /*
+ * Need to worry about multiple scenarios:
+ * - need to go for the longest match
+ * a b => foo #shouldn't match if "a b" is followed by "c d"
+ * a b c d => bar
+ * - need to backtrack - retry matches for tokens already read
+ * a b c d => foo
+ * b c => bar
+ * If the input stream is "a b c x", one will consume "a b c d"
+ * trying to match the first rule... all but "a" should be
+ * pushed back so a match may be made on "b c".
+ * - don't try and match generated tokens (thus need separate queue)
+ * matching is not recursive.
+ * - handle optional generation of original tokens in all these cases,
+ * merging token streams to preserve token positions.
+ * - preserve original positionIncrement of first matched token
+ */
+ @Override
+ public boolean incrementToken() throws IOException {
+ while (true) {
+ // if there are any generated tokens, return them... don't try any
+ // matches against them, as we specifically don't want recursion.
+ if (replacement!=null && replacement.hasNext()) {
+ copy(this, replacement.next());
+ return true;
+ }
+
+ // common case fast-path of first token not matching anything
+ AttributeSource firstTok = nextTok();
+ if (firstTok == null) return false;
+ CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
+ SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
+ if (result == null) {
+ copy(this, firstTok);
+ return true;
+ }
+
+ // fast-path failed, clone ourselves if needed
+ if (firstTok == this)
+ firstTok = cloneAttributes();
+ // OK, we matched a token, so find the longest match.
+
+ matched = new LinkedList<AttributeSource>();
+
+ result = match(result);
+
+ if (result==null) {
+ // no match, simply return the first token read.
+ copy(this, firstTok);
+ return true;
+ }
+
+ // reuse, or create new one each time?
+ ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
+
+ //
+ // there was a match... let's generate the new tokens, merging
+ // in the matched tokens (position increments need adjusting)
+ //
+ AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
+ boolean includeOrig = result.includeOrig();
+
+ AttributeSource origTok = includeOrig ? firstTok : null;
+ PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
+ int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
+ int repPos=0; // curr position in replacement token stream
+ int pos=0; // current position in merged token stream
+
+ for (int i=0; i<result.synonyms.length; i++) {
+ Token repTok = result.synonyms[i];
+ AttributeSource newTok = firstTok.cloneAttributes();
+ CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
+ OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
+ PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
+
+ OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
+
+ newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
+ newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
+ repPos += repTok.getPositionIncrement();
+ if (i==0) repPos=origPos; // make position of first token equal to original
+
+ // if necessary, insert original tokens and adjust position increment
+ while (origTok != null && origPos <= repPos) {
+ PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
+ origPosInc.setPositionIncrement(origPos-pos);
+ generated.add(origTok);
+ pos += origPosInc.getPositionIncrement();
+ origTok = matched.isEmpty() ? null : matched.removeFirst();
+ if (origTok != null) {
+ origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
+ origPos += origPosInc.getPositionIncrement();
+ }
+ }
+
+ newPosIncAtt.setPositionIncrement(repPos - pos);
+ generated.add(newTok);
+ pos += newPosIncAtt.getPositionIncrement();
+ }
+
+ // finish up any leftover original tokens
+ while (origTok!=null) {
+ PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
+ origPosInc.setPositionIncrement(origPos-pos);
+ generated.add(origTok);
+ pos += origPosInc.getPositionIncrement();
+ origTok = matched.isEmpty() ? null : matched.removeFirst();
+ if (origTok != null) {
+ origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
+ origPos += origPosInc.getPositionIncrement();
+ }
+ }
+
+ // what if we replaced a longer sequence with a shorter one?
+ // a/0 b/5 => foo/0
+ // should I re-create the gap on the next buffered token?
+
+ replacement = generated.iterator();
+ // Now return to the top of the loop to read and return the first
+ // generated token.. The reason this is done is that we may have generated
+ // nothing at all, and may need to continue with more matching logic.
+ }
+ }
+
+
+ //
+ // Defer creation of the buffer until the first time it is used to
+ // optimize short fields with no matches.
+ //
+ private LinkedList<AttributeSource> buffer;
+ private LinkedList<AttributeSource> matched;
+
+ private boolean exhausted;
+
+ private AttributeSource nextTok() throws IOException {
+ if (buffer!=null && !buffer.isEmpty()) {
+ return buffer.removeFirst();
+ } else {
+ if (!exhausted && input.incrementToken()) {
+ return this;
+ } else {
+ exhausted = true;
+ return null;
+ }
+ }
+ }
+
+ private void pushTok(AttributeSource t) {
+ if (buffer==null) buffer=new LinkedList<AttributeSource>();
+ buffer.addFirst(t);
+ }
+
+ private SlowSynonymMap match(SlowSynonymMap map) throws IOException {
+ SlowSynonymMap result = null;
+
+ if (map.submap != null) {
+ AttributeSource tok = nextTok();
+ if (tok != null) {
+ // clone ourselves.
+ if (tok == this)
+ tok = cloneAttributes();
+ // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
+ CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
+ SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
+
+ if (subMap != null) {
+ // recurse
+ result = match(subMap);
+ }
+
+ if (result != null) {
+ matched.addFirst(tok);
+ } else {
+ // push back unmatched token
+ pushTok(tok);
+ }
+ }
+ }
+
+ // no longer sequence matched, so if this node has synonyms, it's the match.
+ if (result==null && map.synonyms!=null) {
+ result = map;
+ }
+
+ return result;
+ }
+
+ private void copy(AttributeSource target, AttributeSource source) {
+ if (target != source)
+ source.copyTo(target);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
+ replacement = null;
+ exhausted = false;
+ }
+}
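
For illustration only (not part of the patch): the backtracking case described in the incrementToken() comment above can be written with the strings()/assertTokenizesTo() helpers of TestSlowSynonymFilter further down in this patch. The concrete rules and the expected token stream are assumptions chosen to match that description.

    // illustration only; rules and expectation are assumed, helpers come from TestSlowSynonymFilter
    SlowSynonymMap map = new SlowSynonymMap();
    map.add(strings("a b c d"), SlowSynonymMap.makeTokens(strings("foo")), false, true);
    map.add(strings("b c"), SlowSynonymMap.makeTokens(strings("bar")), false, true);
    // "a b c x": the longer rule fails at "x", all but "a" is pushed back, then "b c" matches
    assertTokenizesTo(map, "a b c x", new String[] { "a", "bar", "x" });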
diff --git a/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilterFactory.java
new file mode 100644
index 00000000000..3390d0d53c0
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilterFactory.java
@@ -0,0 +1,188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Factory for {@link SlowSynonymFilter} (only used with luceneMatchVersion < 3.4)
+ *
+ * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
+ * expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+ * </analyzer>
+ * </fieldType>
+ * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
+ */
+@Deprecated
+final class SlowSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+
+ public void inform(ResourceLoader loader) {
+ String synonyms = args.get("synonyms");
+ if (synonyms == null)
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
+ boolean ignoreCase = getBoolean("ignoreCase", false);
+ boolean expand = getBoolean("expand", true);
+
+ String tf = args.get("tokenizerFactory");
+ TokenizerFactory tokFactory = null;
+ if( tf != null ){
+ tokFactory = loadTokenizerFactory( loader, tf, args );
+ }
+
+ Iterable<String> wlist=loadRules( synonyms, loader );
+
+ synMap = new SlowSynonymMap(ignoreCase);
+ parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
+ }
+
+ /**
+ * @return a list of all rules
+ */
+ protected Iterable<String> loadRules( String synonyms, ResourceLoader loader ) {
+ List<String> wlist=null;
+ try {
+ File synonymFile = new File(synonyms);
+ if (synonymFile.exists()) {
+ wlist = loader.getLines(synonyms);
+ } else {
+ List<String> files = StrUtils.splitFileNames(synonyms);
+ wlist = new ArrayList<String>();
+ for (String file : files) {
+ List<String> lines = loader.getLines(file.trim());
+ wlist.addAll(lines);
+ }
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ return wlist;
+ }
+
+ private SlowSynonymMap synMap;
+
+ static void parseRules(Iterable<String> rules, SlowSynonymMap map, String mappingSep,
+ String synSep, boolean expansion, TokenizerFactory tokFactory) {
+ int count=0;
+ for (String rule : rules) {
+ // To use regexes, we need an expression that specifies an odd number of chars.
+ // This can't really be done with string.split(), and since we need to
+ // do unescaping at some point anyway, we wouldn't be saving any effort
+ // by using regexes.
+
+ List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);
+
+ List<List<String>> source;
+ List<List<String>> target;
+
+ if (mapping.size() > 2) {
+ throw new RuntimeException("Invalid Synonym Rule:" + rule);
+ } else if (mapping.size()==2) {
+ source = getSynList(mapping.get(0), synSep, tokFactory);
+ target = getSynList(mapping.get(1), synSep, tokFactory);
+ } else {
+ source = getSynList(mapping.get(0), synSep, tokFactory);
+ if (expansion) {
+ // expand to all arguments
+ target = source;
+ } else {
+ // reduce to first argument
+ target = new ArrayList<List<String>>(1);
+ target.add(source.get(0));
+ }
+ }
+
+ boolean includeOrig=false;
+ for (List<String> fromToks : source) {
+ count++;
+ for (List<String> toToks : target) {
+ map.add(fromToks,
+ SlowSynonymMap.makeTokens(toToks),
+ includeOrig,
+ true
+ );
+ }
+ }
+ }
+ }
+
+ // a , b c , d e f => [[a],[b,c],[d,e,f]]
+ private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
+ List<String> strList = StrUtils.splitSmart(str, separator, false);
+ // now split on whitespace to get a list of token strings
+ List<List<String>> synList = new ArrayList<List<String>>();
+ for (String toks : strList) {
+ List<String> tokList = tokFactory == null ?
+ StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
+ synList.add(tokList);
+ }
+ return synList;
+ }
+
+ private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
+ StringReader reader = new StringReader( source );
+ TokenStream ts = loadTokenizer(tokFactory, reader);
+ List<String> tokList = new ArrayList<String>();
+ try {
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ while (ts.incrementToken()){
+ if( termAtt.length() > 0 )
+ tokList.add( termAtt.toString() );
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ finally{
+ reader.close();
+ }
+ return tokList;
+ }
+
+ private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
+ TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
+ tokFactory.init( args );
+ return tokFactory;
+ }
+
+ private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
+ return tokFactory.create( reader );
+ }
+
+ public SlowSynonymMap getSynonymMap() {
+ return synMap;
+ }
+
+ public SlowSynonymFilter create(TokenStream input) {
+ return new SlowSynonymFilter(input,synMap);
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/analysis/SlowSynonymMap.java b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymMap.java
new file mode 100644
index 00000000000..21570ae4438
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymMap.java
@@ -0,0 +1,162 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.util.Version;
+
+import java.util.*;
+
+/** Mapping rules for use with {@link SlowSynonymFilter}
+ * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
+ */
+@Deprecated
+class SlowSynonymMap {
+ /** @lucene.internal */
+ public CharArrayMap<SlowSynonymMap> submap; // recursive: Map<String,SlowSynonymMap>
+ /** @lucene.internal */
+ public Token[] synonyms;
+ int flags;
+
+ static final int INCLUDE_ORIG=0x01;
+ static final int IGNORE_CASE=0x02;
+
+ public SlowSynonymMap() {}
+ public SlowSynonymMap(boolean ignoreCase) {
+ if (ignoreCase) flags |= IGNORE_CASE;
+ }
+
+ public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; }
+ public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; }
+
+ /**
+ * @param singleMatch List<String>, the sequence of strings to match
+ * @param replacement List<Token> the list of tokens to use on a match
+ * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
+ * @param mergeExisting merge the replacement tokens with any other mappings that exist
+ */
+ public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
+ SlowSynonymMap currMap = this;
+ for (String str : singleMatch) {
+ if (currMap.submap==null) {
+ // for now hardcode at 4.0, as it's what the old code did.
+ // would be nice to fix, but shouldn't store a version in each submap!!!
+ currMap.submap = new CharArrayMap<SlowSynonymMap>(Version.LUCENE_40, 1, ignoreCase());
+ }
+
+ SlowSynonymMap map = currMap.submap.get(str);
+ if (map==null) {
+ map = new SlowSynonymMap();
+ map.flags |= flags & IGNORE_CASE;
+ currMap.submap.put(str, map);
+ }
+
+ currMap = map;
+ }
+
+ if (currMap.synonyms != null && !mergeExisting) {
+ throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
+ }
+ List<Token> superset = currMap.synonyms==null ? replacement :
+ mergeTokens(Arrays.asList(currMap.synonyms), replacement);
+ currMap.synonyms = superset.toArray(new Token[superset.size()]);
+ if (includeOrig) currMap.flags |= INCLUDE_ORIG;
+ }
+
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder("<");
+ if (synonyms!=null) {
+ sb.append("[");
+ for (int i=0; i<synonyms.length; i++) {
+ if (i!=0) sb.append(',');
+ sb.append(synonyms[i]);
+ }
+ if ((flags & INCLUDE_ORIG)!=0) {
+ sb.append(",ORIG");
+ }
+ sb.append("],");
+ }
+ sb.append(submap);
+ sb.append(">");
+ return sb.toString();
+ }
+
+
+
+ /** Produces a List<Token> from a List<String> */
+ public static List<Token> makeTokens(List<String> strings) {
+ List<Token> ret = new ArrayList<Token>(strings.size());
+ for (String str : strings) {
+ //Token newTok = new Token(str,0,0,"SYNONYM");
+ Token newTok = new Token(str, 0,0,"SYNONYM");
+ ret.add(newTok);
+ }
+ return ret;
+ }
+
+
+ /**
+ * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
+ * the tokens end up at the same position.
+ *
+ * Example: [a b] merged with [c d] produces [a/c b/d] ('/' denotes tokens in the same position)
+ * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
+ *
+ */
+ public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
+ ArrayList<Token> result = new ArrayList<Token>();
+ if (lst1 ==null || lst2 ==null) {
+ if (lst2 != null) result.addAll(lst2);
+ if (lst1 != null) result.addAll(lst1);
+ return result;
+ }
+
+ int pos=0;
+ Iterator<Token> iter1=lst1.iterator();
+ Iterator<Token> iter2=lst2.iterator();
+ Token tok1 = iter1.hasNext() ? iter1.next() : null;
+ Token tok2 = iter2.hasNext() ? iter2.next() : null;
+ int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0;
+ int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0;
+ while(tok1!=null || tok2!=null) {
+ while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
+ Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
+ tok.copyBuffer(tok1.buffer(), 0, tok1.length());
+ tok.setPositionIncrement(pos1-pos);
+ result.add(tok);
+ pos=pos1;
+ tok1 = iter1.hasNext() ? iter1.next() : null;
+ pos1 += tok1!=null ? tok1.getPositionIncrement() : 0;
+ }
+ while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
+ Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
+ tok.copyBuffer(tok2.buffer(), 0, tok2.length());
+ tok.setPositionIncrement(pos2-pos);
+ result.add(tok);
+ pos=pos2;
+ tok2 = iter2.hasNext() ? iter2.next() : null;
+ pos2 += tok2!=null ? tok2.getPositionIncrement() : 0;
+ }
+ }
+ return result;
+ }
+
+}
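
For illustration only (not part of the patch): the first mergeTokens example from the javadoc above, spelled out with the helpers defined in this class; the stated result follows from the merge loop and is an editorial walk-through, not a quoted test.

    // illustration only: merging [a b] with [c d], all tokens with the default posInc of 1
    List<Token> lst1 = SlowSynonymMap.makeTokens(Arrays.asList("a", "b"));
    List<Token> lst2 = SlowSynonymMap.makeTokens(Arrays.asList("c", "d"));
    List<Token> merged = SlowSynonymMap.mergeTokens(lst1, lst2);
    // expected order: a, c, d, b with posIncs 1, 0, 1, 0 -- "a"/"c" share the first
    // position and "b"/"d" the second, i.e. [a/c b/d]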
diff --git a/solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java
index 3b8e4802b7d..d95fd1855b2 100644
--- a/solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java
@@ -1,189 +1,54 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
package org.apache.solr.analysis;
+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
-import java.io.File;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
/**
* Factory for {@link SynonymFilter}.
*
* <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- * <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
- * expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
+ * format="solr" ignoreCase="false" expand="true"
+ * tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
* </analyzer>
* </fieldType>
- *
*/
public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ private BaseTokenFilterFactory delegator;
+ @Override
+ public void init(Map<String,String> args) {
+ super.init(args);
+ assureMatchVersion();
+ if (luceneMatchVersion.onOrAfter(Version.LUCENE_34)) {
+ delegator = new FSTSynonymFilterFactory();
+ } else {
+ // check if you use the new optional arg "format". this makes no sense for the old one,
+ // as it's wired to solr's synonyms format only.
+ if (args.containsKey("format") && !args.get("format").equals("solr")) {
+ throw new IllegalArgumentException("You must specify luceneMatchVersion >= 3.4 to use alternate synonyms formats");
+ }
+ delegator = new SlowSynonymFilterFactory();
+ }
+ delegator.init(args);
+ }
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ assert delegator != null : "init() was not called!";
+ return delegator.create(input);
+ }
+
+ @Override
public void inform(ResourceLoader loader) {
- String synonyms = args.get("synonyms");
- if (synonyms == null)
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
- boolean ignoreCase = getBoolean("ignoreCase", false);
- boolean expand = getBoolean("expand", true);
-
- String tf = args.get("tokenizerFactory");
- TokenizerFactory tokFactory = null;
- if( tf != null ){
- tokFactory = loadTokenizerFactory( loader, tf, args );
- }
-
- Iterable wlist=loadRules( synonyms, loader );
-
- synMap = new SynonymMap(ignoreCase);
- parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
- }
-
- /**
- * @return a list of all rules
- */
- protected Iterable loadRules( String synonyms, ResourceLoader loader ) {
- List wlist=null;
- try {
- File synonymFile = new File(synonyms);
- if (synonymFile.exists()) {
- wlist = loader.getLines(synonyms);
- } else {
- List files = StrUtils.splitFileNames(synonyms);
- wlist = new ArrayList();
- for (String file : files) {
- List lines = loader.getLines(file.trim());
- wlist.addAll(lines);
- }
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- return wlist;
- }
-
- private SynonymMap synMap;
-
- static void parseRules(Iterable rules, SynonymMap map, String mappingSep,
- String synSep, boolean expansion, TokenizerFactory tokFactory) {
- int count=0;
- for (String rule : rules) {
- // To use regexes, we need an expression that specifies an odd number of chars.
- // This can't really be done with string.split(), and since we need to
- // do unescaping at some point anyway, we wouldn't be saving any effort
- // by using regexes.
-
- List mapping = StrUtils.splitSmart(rule, mappingSep, false);
-
- List> source;
- List> target;
-
- if (mapping.size() > 2) {
- throw new RuntimeException("Invalid Synonym Rule:" + rule);
- } else if (mapping.size()==2) {
- source = getSynList(mapping.get(0), synSep, tokFactory);
- target = getSynList(mapping.get(1), synSep, tokFactory);
- } else {
- source = getSynList(mapping.get(0), synSep, tokFactory);
- if (expansion) {
- // expand to all arguments
- target = source;
- } else {
- // reduce to first argument
- target = new ArrayList>(1);
- target.add(source.get(0));
- }
- }
-
- boolean includeOrig=false;
- for (List fromToks : source) {
- count++;
- for (List toToks : target) {
- map.add(fromToks,
- SynonymMap.makeTokens(toToks),
- includeOrig,
- true
- );
- }
- }
- }
- }
-
- // a , b c , d e f => [[a],[b,c],[d,e,f]]
- private static List> getSynList(String str, String separator, TokenizerFactory tokFactory) {
- List strList = StrUtils.splitSmart(str, separator, false);
- // now split on whitespace to get a list of token strings
- List> synList = new ArrayList>();
- for (String toks : strList) {
- List tokList = tokFactory == null ?
- StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
- synList.add(tokList);
- }
- return synList;
- }
-
- private static List splitByTokenizer(String source, TokenizerFactory tokFactory){
- StringReader reader = new StringReader( source );
- TokenStream ts = loadTokenizer(tokFactory, reader);
- List tokList = new ArrayList();
- try {
- CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
- while (ts.incrementToken()){
- if( termAtt.length() > 0 )
- tokList.add( termAtt.toString() );
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- finally{
- reader.close();
- }
- return tokList;
- }
-
- private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){
- TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
- tokFactory.init( args );
- return tokFactory;
- }
-
- private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
- return tokFactory.create( reader );
- }
-
- public SynonymMap getSynonymMap() {
- return synMap;
- }
-
- public SynonymFilter create(TokenStream input) {
- return new SynonymFilter(input,synMap);
+ assert delegator != null : "init() was not called!";
+ ((ResourceLoaderAware) delegator).inform(loader);
}
}
diff --git a/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java b/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java
index f0dd0782567..6afda9bed98 100644
--- a/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java
@@ -17,30 +17,69 @@
package org.apache.solr.analysis;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.synonym.SynonymFilter;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.junit.Test;
+import org.apache.solr.common.ResourceLoader;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
/**
* @since solr 1.4
*/
public class TestMultiWordSynonyms extends BaseTokenTestCase {
- @Test
- public void testMultiWordSynonyms() throws IOException {
+ /**
+ * @deprecated Remove this test in 5.0
+ */
+ @Deprecated
+ public void testMultiWordSynonymsOld() throws IOException {
List<String> rules = new ArrayList<String>();
rules.add("a b c,d");
- SynonymMap synMap = new SynonymMap(true);
- SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
+ SlowSynonymMap synMap = new SlowSynonymMap(true);
+ SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
- SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
+ SlowSynonymFilter ts = new SlowSynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
// This fails because ["e","e"] is the value of the token stream
assertTokenStreamContents(ts, new String[] { "a", "e" });
}
+
+ public void testMultiWordSynonyms() throws IOException {
+ SynonymFilterFactory factory = new SynonymFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.putAll(DEFAULT_VERSION_PARAM);
+ args.put("synonyms", "synonyms.txt");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader("a b c,d"));
+ TokenStream ts = factory.create(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false));
+ // This fails because ["e","e"] is the value of the token stream
+ assertTokenStreamContents(ts, new String[] { "a", "e" });
+ }
+
+ private class StringMockSolrResourceLoader implements ResourceLoader {
+ String text;
+
+ StringMockSolrResourceLoader(String text) {
+ this.text = text;
+ }
+
+ public List<String> getLines(String resource) throws IOException {
+ return null;
+ }
+
+ public Object newInstance(String cname, String... subpackages) {
+ return null;
+ }
+
+ public InputStream openResource(String resource) throws IOException {
+ return new ByteArrayInputStream(text.getBytes("UTF-8"));
+ }
+ }
}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java b/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java
similarity index 92%
rename from modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java
rename to solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java
index 82c2e1ce6ae..740ad33b17f 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.synonym;
+package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
@@ -29,51 +29,52 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
/**
+ * @deprecated Remove this test in Lucene 5.0
*/
-public class TestSynonymFilter extends BaseTokenStreamTestCase {
+@Deprecated
+public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
static List strings(String str) {
String[] arr = str.split(" ");
return Arrays.asList(arr);
}
- static void assertTokenizesTo(SynonymMap dict, String input,
+ static void assertTokenizesTo(SlowSynonymMap dict, String input,
String expected[]) throws IOException {
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
- SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+ SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected);
}
- static void assertTokenizesTo(SynonymMap dict, String input,
+ static void assertTokenizesTo(SlowSynonymMap dict, String input,
String expected[], int posIncs[]) throws IOException {
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
- SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+ SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
- static void assertTokenizesTo(SynonymMap dict, List input,
+ static void assertTokenizesTo(SlowSynonymMap dict, List input,
String expected[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
- SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+ SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
- static void assertTokenizesTo(SynonymMap dict, List input,
+ static void assertTokenizesTo(SlowSynonymMap dict, List input,
String expected[], int startOffsets[], int endOffsets[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
- SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+ SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
posIncs);
}
public void testMatching() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@@ -110,7 +111,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
}
public void testIncludeOrig() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = true;
boolean merge = true;
@@ -167,7 +168,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testMapMerge() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@@ -206,7 +207,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testOverlap() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@@ -229,7 +230,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
}
public void testPositionIncrements() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@@ -264,7 +265,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testPositionIncrementsWithOrig() throws IOException {
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = true;
boolean merge = true;
@@ -304,7 +305,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
// x=>y
// analysing "a x" causes "y" to have a bad offset (end less than start)
// SOLR-167
- SynonymMap map = new SynonymMap();
+ SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
diff --git a/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java b/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java
new file mode 100644
index 00000000000..24b4ef505a9
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java
@@ -0,0 +1,62 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.util.Version;
+import org.apache.solr.core.SolrResourceLoader;
+
+public class TestSynonymFilterFactory extends BaseTokenTestCase {
+ /** test that we can parse and use the solr syn file */
+ public void testSynonyms() throws Exception {
+ SynonymFilterFactory factory = new SynonymFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.putAll(DEFAULT_VERSION_PARAM);
+ args.put("synonyms", "synonyms.txt");
+ factory.init(args);
+ factory.inform(new SolrResourceLoader(null, null));
+ TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
+ assertTrue(ts instanceof SynonymFilter);
+ assertTokenStreamContents(ts,
+ new String[] { "GB", "gib", "gigabyte", "gigabytes" },
+ new int[] { 1, 0, 0, 0 });
+ }
+
+ /** test that we can parse and use the solr syn file, with the old impl
+ * @deprecated Remove this test in Lucene 5.0 */
+ @Deprecated
+ public void testSynonymsOld() throws Exception {
+ SynonymFilterFactory factory = new SynonymFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("luceneMatchVersion", Version.LUCENE_33.toString());
+ args.put("synonyms", "synonyms.txt");
+ factory.init(args);
+ factory.inform(new SolrResourceLoader(null, null));
+ TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
+ assertTrue(ts instanceof SlowSynonymFilter);
+ assertTokenStreamContents(ts,
+ new String[] { "GB", "gib", "gigabyte", "gigabytes" },
+ new int[] { 1, 0, 0, 0 });
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/analysis/TestSynonymMap.java b/solr/core/src/test/org/apache/solr/analysis/TestSynonymMap.java
index d3a6ee77873..66b3a5c7743 100644
--- a/solr/core/src/test/org/apache/solr/analysis/TestSynonymMap.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestSynonymMap.java
@@ -25,32 +25,35 @@ import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.common.ResourceLoader;
+/**
+ * @deprecated Remove this test in Lucene 5.0
+ */
+@Deprecated
public class TestSynonymMap extends LuceneTestCase {
public void testInvalidMappingRules() throws Exception {
- SynonymMap synMap = new SynonymMap( true );
+ SlowSynonymMap synMap = new SlowSynonymMap( true );
List<String> rules = new ArrayList<String>( 1 );
rules.add( "a=>b=>c" );
try{
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
fail( "RuntimeException must be thrown." );
}
catch( RuntimeException expected ){}
}
public void testReadMappingRules() throws Exception {
- SynonymMap synMap;
+ SlowSynonymMap synMap;
// (a)->[b]
List<String> rules = new ArrayList<String>();
rules.add( "a=>b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "b" );
@@ -58,8 +61,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (b)->[c]
rules.clear();
rules.add( "a,b=>c" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "c" );
assertTokIncludes( synMap, "b", "c" );
@@ -67,8 +70,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (a)->[b][c]
rules.clear();
rules.add( "a=>b,c" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "b" );
assertTokIncludes( synMap, "a", "c" );
@@ -78,8 +81,8 @@ public class TestSynonymMap extends LuceneTestCase {
rules.clear();
rules.add( "a=>a1" );
rules.add( "a b=>a2" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a1" );
assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -92,8 +95,8 @@ public class TestSynonymMap extends LuceneTestCase {
rules.add( "a=>a1" );
rules.add( "a b=>a2" );
rules.add( "a c=>a3" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a1" );
assertEquals( 2, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -109,8 +112,8 @@ public class TestSynonymMap extends LuceneTestCase {
rules.add( "a b=>a2" );
rules.add( "b=>b1" );
rules.add( "b c=>b2" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a1" );
assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -121,14 +124,14 @@ public class TestSynonymMap extends LuceneTestCase {
}
public void testRead1waySynonymRules() throws Exception {
- SynonymMap synMap;
+ SlowSynonymMap synMap;
// (a)->[a]
// (b)->[a]
List<String> rules = new ArrayList<String>();
rules.add( "a,b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "b", "a" );
@@ -138,8 +141,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (c)->[a]
rules.clear();
rules.add( "a,b,c" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 3, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "b", "a" );
@@ -149,8 +152,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (b1)->(b2)->[a]
rules.clear();
rules.add( "a,b1 b2" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
@@ -160,8 +163,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (b)->[a1][a2]
rules.clear();
rules.add( "a1 a2,b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 2, synMap.submap.size() );
assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@@ -171,14 +174,14 @@ public class TestSynonymMap extends LuceneTestCase {
}
public void testRead2waySynonymRules() throws Exception {
- SynonymMap synMap;
+ SlowSynonymMap synMap;
// (a)->[a][b]
// (b)->[a][b]
List rules = new ArrayList();
rules.add( "a,b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b" );
@@ -190,8 +193,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (c)->[a][b][c]
rules.clear();
rules.add( "a,b,c" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 3, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b" );
@@ -209,8 +212,8 @@ public class TestSynonymMap extends LuceneTestCase {
// [b1][b2]
rules.clear();
rules.add( "a,b1 b2" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b1" );
@@ -226,8 +229,8 @@ public class TestSynonymMap extends LuceneTestCase {
// [b]
rules.clear();
rules.add( "a1 a2,b" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@@ -239,7 +242,7 @@ public class TestSynonymMap extends LuceneTestCase {
}
public void testBigramTokenizer() throws Exception {
- SynonymMap synMap;
+ SlowSynonymMap synMap;
// prepare bi-gram tokenizer factory
BaseTokenizerFactory tf = new NGramTokenizerFactory();
@@ -251,8 +254,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (ab)->(bc)->(cd)->[ef][fg][gh]
List rules = new ArrayList();
rules.add( "abcd=>efgh" );
- synMap = new SynonymMap( true );
- SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
+ synMap = new SlowSynonymMap( true );
+ SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
assertEquals( 1, synMap.submap.size() );
assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
@@ -265,7 +268,7 @@ public class TestSynonymMap extends LuceneTestCase {
public void testLoadRules() throws Exception {
Map args = new HashMap();
args.put( "synonyms", "something.txt" );
- SynonymFilterFactory ff = new SynonymFilterFactory();
+ SlowSynonymFilterFactory ff = new SlowSynonymFilterFactory();
ff.init(args);
ff.inform( new ResourceLoader() {
@Override
@@ -289,7 +292,7 @@ public class TestSynonymMap extends LuceneTestCase {
}
});
- SynonymMap synMap = ff.getSynonymMap();
+ SlowSynonymMap synMap = ff.getSynonymMap();
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b" );
@@ -298,7 +301,7 @@ public class TestSynonymMap extends LuceneTestCase {
}
- private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception {
+ private void assertTokIncludes( SlowSynonymMap map, String src, String exp ) throws Exception {
Token[] tokens = map.submap.get( src ).synonyms;
boolean inc = false;
for( Token token : tokens ){
@@ -308,7 +311,7 @@ public class TestSynonymMap extends LuceneTestCase {
assertTrue( inc );
}
- private SynonymMap getSubSynonymMap( SynonymMap map, String src ){
+ private SlowSynonymMap getSubSynonymMap( SlowSynonymMap map, String src ){
return map.submap.get( src );
}
}