diff --git a/lucene/build.xml b/lucene/build.xml index 7a70a63f8c7..ab98e309df2 100644 --- a/lucene/build.xml +++ b/lucene/build.xml @@ -230,7 +230,6 @@ - @@ -250,7 +249,6 @@ - diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index cd72a9865a3..d636a0dfca7 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -5,11 +5,6 @@ http://s.apache.org/luceneversions ======================= Trunk (not yet released) ======================= -Changes in runtime behavior - - * LUCENE-3250: Wordnet's SynExpand requires a non-null Analyzer (it no longer - treats null as StandardAnalyzer). (Robert Muir) - Build * LUCENE-2845: Moved contrib/benchmark to modules. @@ -78,6 +73,10 @@ New Features documents must be indexed as a document block, using IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless) + * LUCENE-3233: Added SynonymFilter for applying multi-word synonyms + during indexing or querying (with parsers for wordnet and solr formats). + Removed contrib/wordnet. (Robert Muir, Mike McCandless) + API Changes Bug Fixes diff --git a/lucene/contrib/wordnet/README.txt b/lucene/contrib/wordnet/README.txt deleted file mode 100644 index 55266d823cc..00000000000 --- a/lucene/contrib/wordnet/README.txt +++ /dev/null @@ -1,5 +0,0 @@ -As of 2002-11-13 WordNet Lucene contribution contains a single Java class: - org.apache.lucene.wordnet.Syns2Index. - -This class creates a Lucene index with synonyms for English words from -a Prolog file, which is a part of WordNet database. diff --git a/lucene/contrib/wordnet/build.xml b/lucene/contrib/wordnet/build.xml deleted file mode 100644 index 3e0e096e6af..00000000000 --- a/lucene/contrib/wordnet/build.xml +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - WordNet - - - - - - - - - - - - Index already exists - must remove first. - - - - - - - - - - - - - - - - - Index does not exist. - - - - Must specify 'word' property. - - - - - - - - - - - - - - diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java deleted file mode 100755 index a7bd81270fd..00000000000 --- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java +++ /dev/null @@ -1,142 +0,0 @@ -package org.apache.lucene.wordnet; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.StringReader; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.TermQuery; - - -/** - * Expand a query by looking up synonyms for every term. - * You need to invoke {@link Syns2Index} first to build the synonym index. - * - * @see Syns2Index - */ -public final class SynExpand { - - /** - * Perform synonym expansion on a query. - * - * @param query users query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser. - * - * @param syns a opened to the Lucene index you previously created with {@link Syns2Index}. The searcher is not closed or otherwise altered. - * - * @param a analyzer used to parse the users query. - * - * @param f optional field name to search in or null if you want the default of "contents" - * - * @param boost optional boost applied to synonyms else no boost is applied - * - * @return the expanded Query - */ - public static Query expand( String query, - IndexSearcher syns, - Analyzer a, - String f, - final float boost) - throws IOException - { - final Set already = new HashSet(); // avoid dups - List top = new LinkedList(); // needs to be separately listed.. - final String field = ( f == null) ? 
"contents" : f; - - // [1] Parse query into separate words so that when we expand we can avoid dups - TokenStream ts = a.reusableTokenStream( field, new StringReader( query)); - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - while (ts.incrementToken()) { - String word = termAtt.toString(); - if ( already.add( word)) - top.add( word); - } - ts.end(); - ts.close(); - final BooleanQuery tmp = new BooleanQuery(); - - // [2] form query - Iterator it = top.iterator(); - while ( it.hasNext()) - { - // [2a] add to level words in - String word = it.next(); - TermQuery tq = new TermQuery( new Term( field, word)); - tmp.add( tq, BooleanClause.Occur.SHOULD); - - syns.search(new TermQuery( new Term(Syns2Index.F_WORD, word)), new Collector() { - IndexReader reader; - - @Override - public boolean acceptsDocsOutOfOrder() { - return true; - } - - @Override - public void collect(int doc) throws IOException { - Document d = reader.document(doc); - String[] values = d.getValues( Syns2Index.F_SYN); - for ( int j = 0; j < values.length; j++) - { - String syn = values[ j]; - if ( already.add( syn)) // avoid dups of top level words and synonyms - { - TermQuery tq = new TermQuery( new Term( field, syn)); - if ( boost > 0) // else keep normal 1.0 - tq.setBoost( boost); - tmp.add( tq, BooleanClause.Occur.SHOULD); - } - } - } - - @Override - public void setNextReader(AtomicReaderContext context) - throws IOException { - this.reader = context.reader; - } - - @Override - public void setScorer(Scorer scorer) throws IOException {} - }); - - // [2b] add in unique synonums - } - - - return tmp; - } - -} diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java deleted file mode 100644 index 2c2fb14951d..00000000000 --- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java +++ /dev/null @@ -1,170 +0,0 @@ -package org.apache.lucene.wordnet; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.File; -import java.io.IOException; -import java.io.StringReader; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TotalHitCountCollector; -import org.apache.lucene.store.FSDirectory; - - -/** - * Test program to look up synonyms. - */ -public class SynLookup { - - public static void main(String[] args) throws IOException { - if (args.length != 2) { - System.out.println( - "java org.apache.lucene.wordnet.SynLookup "); - } - - FSDirectory directory = FSDirectory.open(new File(args[0])); - IndexSearcher searcher = new IndexSearcher(directory, true); - - String word = args[1]; - Query query = new TermQuery(new Term(Syns2Index.F_WORD, word)); - TotalHitCountCollector countingCollector = new TotalHitCountCollector(); - searcher.search(query, countingCollector); - - if (countingCollector.getTotalHits() == 0) { - System.out.println("No synonyms found for " + word); - } else { - System.out.println("Synonyms found for \"" + word + "\":"); - } - - ScoreDoc[] hits = searcher.search(query, countingCollector.getTotalHits()).scoreDocs; - - for (int i = 0; i < hits.length; i++) { - Document doc = searcher.doc(hits[i].doc); - - String[] values = doc.getValues(Syns2Index.F_SYN); - - for (int j = 0; j < values.length; j++) { - System.out.println(values[j]); - } - } - - searcher.close(); - directory.close(); - } - - - /** - * Perform synonym expansion on a query. - * - * @param query - * @param syns - * @param a - * @param field - * @param boost - */ - public static Query expand( String query, - IndexSearcher syns, - Analyzer a, - final String field, - final float boost) - throws IOException - { - final Set already = new HashSet(); // avoid dups - List top = new LinkedList(); // needs to be separately listed.. 
- - // [1] Parse query into separate words so that when we expand we can avoid dups - TokenStream ts = a.reusableTokenStream( field, new StringReader( query)); - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - - while (ts.incrementToken()) { - String word = termAtt.toString(); - if ( already.add( word)) - top.add( word); - } - final BooleanQuery tmp = new BooleanQuery(); - - // [2] form query - Iterator it = top.iterator(); - while ( it.hasNext()) - { - // [2a] add to level words in - String word = it.next(); - TermQuery tq = new TermQuery( new Term( field, word)); - tmp.add( tq, BooleanClause.Occur.SHOULD); - - // [2b] add in unique synonums - syns.search(new TermQuery( new Term(Syns2Index.F_WORD, word)), new Collector() { - IndexReader reader; - - @Override - public boolean acceptsDocsOutOfOrder() { - return true; - } - - @Override - public void collect(int doc) throws IOException { - Document d = reader.document(doc); - String[] values = d.getValues( Syns2Index.F_SYN); - for ( int j = 0; j < values.length; j++) - { - String syn = values[ j]; - if ( already.add( syn)) - { - TermQuery tq = new TermQuery( new Term( field, syn)); - if ( boost > 0) // else keep normal 1.0 - tq.setBoost( boost); - tmp.add( tq, BooleanClause.Occur.SHOULD); - } - } - } - - @Override - public void setNextReader(AtomicReaderContext context) - throws IOException { - this.reader = context.reader; - } - - @Override - public void setScorer(Scorer scorer) throws IOException {} - }); - } - - - return tmp; - } - -} diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java deleted file mode 100644 index 099d653bef1..00000000000 --- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymMap.java +++ /dev/null @@ -1,400 +0,0 @@ -package org.apache.lucene.wordnet; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.TreeMap; -import java.util.TreeSet; - -/** - * Loads the WordNet prolog file wn_s.pl - * into a thread-safe main-memory hash map that can be used for fast - * high-frequency lookups of synonyms for any given (lowercase) word string. - *

- * <p>
- * Symmetry holds: if B is a synonym for A (A -> B), then A is also a synonym for B (B -> A).
- * Transitivity does not necessarily hold: A -> B and B -> C do not imply A -> C.
- * <p>
- * Loading typically takes about 1.5 seconds, so it should be done only once per
- * (server) program execution, using a singleton pattern. Once loaded, a
- * synonym lookup via {@link #getSynonyms(String)} takes constant time O(1).
- * A loaded default synonym map consumes about 10 MB main memory.
- * An instance is immutable, hence thread-safe.
- * <p>
- * This implementation borrows some ideas from the Lucene Syns2Index demo that
- * Dave Spencer originally contributed to Lucene. Dave's approach
- * involved a persistent Lucene index, which is suitable for occasional
- * lookups or very large synonym tables, but is considered unsuitable for
- * high-frequency lookups of medium-sized synonym tables.
- * <p>
- * Example Usage:
- * <pre>
- * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
- * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
- * for (int i = 0; i < words.length; i++) {
- *     String[] synonyms = map.getSynonyms(words[i]);
- *     System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
- * }
- * </pre>
- *
- * Example output:
- * <pre>
- * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
- * woods:[forest, wood]
- * forest:[afforest, timber, timberland, wood, woodland, woods]
- * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
- * xxxx:[]
- * </pre>
- * <p>
- * See also: the WordNet prologdb man page and
- * Dave's synonym demo site - */ -public class SynonymMap { - - /** the index data; Map */ - private final HashMap table; - - private static final String[] EMPTY = new String[0]; - - private static final boolean DEBUG = false; - - /** - * Constructs an instance, loading WordNet synonym data from the given input - * stream. Finally closes the stream. The words in the stream must be in - * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.). - * - * @param input - * the stream to read from (null indicates an empty synonym map) - * @throws IOException - * if an error occured while reading the stream. - */ - public SynonymMap(InputStream input) throws IOException { - this.table = input == null ? new HashMap(0) : read(toByteArray(input)); - } - - /** - * Returns the synonym set for the given word, sorted ascending. - * - * @param word - * the word to lookup (must be in lowercase). - * @return the synonyms; a set of zero or more words, sorted ascending, each - * word containing lowercase characters that satisfy - * Character.isLetter(). - */ - public String[] getSynonyms(String word) { - String[] synonyms = table.get(word); - if (synonyms == null) return EMPTY; - String[] copy = new String[synonyms.length]; // copy for guaranteed immutability - System.arraycopy(synonyms, 0, copy, 0, synonyms.length); - return copy; - } - - /** - * Returns a String representation of the index data for debugging purposes. - * - * @return a String representation - */ - @Override - public String toString() { - StringBuilder buf = new StringBuilder(); - Iterator iter = new TreeMap(table).keySet().iterator(); - int count = 0; - int f0 = 0; - int f1 = 0; - int f2 = 0; - int f3 = 0; - - while (iter.hasNext()) { - String word = iter.next(); - buf.append(word + ":"); - String[] synonyms = getSynonyms(word); - buf.append(Arrays.asList(synonyms)); - buf.append("\n"); - count += synonyms.length; - if (synonyms.length == 0) f0++; - if (synonyms.length == 1) f1++; - if (synonyms.length == 2) f2++; - if (synonyms.length == 3) f3++; - } - - buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3); - return buf.toString(); - } - - /** - * Analyzes/transforms the given word on input stream loading. This default implementation simply - * lowercases the word. Override this method with a custom stemming - * algorithm or similar, if desired. 
- * - * @param word - * the word to analyze - * @return the same word, or a different word (or null to indicate that the - * word should be ignored) - */ - protected String analyze(String word) { - return word.toLowerCase(); - } - - protected boolean isValid(String str) { - for (int i=str.length(); --i >= 0; ) { - if (!Character.isLetter(str.charAt(i))) return false; - } - return true; - } - - private HashMap read(byte[] data) { - int WORDS = (int) (76401 / 0.7); // presizing - int GROUPS = (int) (88022 / 0.7); // presizing - HashMap> word2Groups = new HashMap>(WORDS); // Map - HashMap> group2Words = new HashMap>(GROUPS); // Map - HashMap internedWords = new HashMap(WORDS);// Map - - Charset charset = Charset.forName("UTF-8"); - int lastNum = -1; - Integer lastGroup = null; - int len = data.length; - int i=0; - - while (i < len) { // until EOF - /* Part A: Parse a line */ - - // scan to beginning of group - while (i < len && data[i] != '(') i++; - if (i >= len) break; // EOF - i++; - - // parse group - int num = 0; - while (i < len && data[i] != ',') { - num = 10*num + (data[i] - 48); - i++; - } - i++; -// if (DEBUG) System.err.println("num="+ num); - - // scan to beginning of word - while (i < len && data[i] != '\'') i++; - i++; - - // scan to end of word - int start = i; - do { - while (i < len && data[i] != '\'') i++; - i++; - } while (i < len && data[i] != ','); // word must end with "'," - - if (i >= len) break; // EOF - String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString(); -// String word = new String(data, 0, start, i-start-1); // ASCII - - /* - * Part B: ignore phrases (with spaces and hyphens) and - * non-alphabetic words, and let user customize word (e.g. do some - * stemming) - */ - if (!isValid(word)) continue; // ignore - word = analyze(word); - if (word == null || word.length() == 0) continue; // ignore - - - /* Part C: Add (group,word) to tables */ - - // ensure compact string representation, minimizing memory overhead - String w = internedWords.get(word); - if (w == null) { - word = new String(word); // ensure compact string - internedWords.put(word, word); - } else { - word = w; - } - - Integer group = lastGroup; - if (num != lastNum) { - group = Integer.valueOf(num); - lastGroup = group; - lastNum = num; - } - - // add word --> group - ArrayList groups = word2Groups.get(word); - if (groups == null) { - groups = new ArrayList(1); - word2Groups.put(word, groups); - } - groups.add(group); - - // add group --> word - ArrayList words = group2Words.get(group); - if (words == null) { - words = new ArrayList(1); - group2Words.put(group, words); - } - words.add(word); - } - - - /* Part D: compute index data structure */ - HashMap word2Syns = createIndex(word2Groups, group2Words); - - /* Part E: minimize memory consumption by a factor 3 (or so) */ -// if (true) return word2Syns; - word2Groups = null; // help gc - //TODO: word2Groups.clear(); would be more appropriate ? - group2Words = null; // help gc - //TODO: group2Words.clear(); would be more appropriate ? 
- - return optimize(word2Syns, internedWords); - } - - private HashMap createIndex(Map> word2Groups, Map> group2Words) { - HashMap word2Syns = new HashMap(); - - for (final Map.Entry> entry : word2Groups.entrySet()) { // for each word - ArrayList group = entry.getValue(); - String word = entry.getKey(); - -// HashSet synonyms = new HashSet(); - TreeSet synonyms = new TreeSet(); - for (int i=group.size(); --i >= 0; ) { // for each groupID of word - ArrayList words = group2Words.get(group.get(i)); - for (int j=words.size(); --j >= 0; ) { // add all words - String synonym = words.get(j); // note that w and word are interned - if (synonym != word) { // a word is implicitly it's own synonym - synonyms.add(synonym); - } - } - } - - int size = synonyms.size(); - if (size > 0) { - String[] syns = new String[size]; - if (size == 1) - syns[0] = synonyms.first(); - else - synonyms.toArray(syns); -// if (syns.length > 1) Arrays.sort(syns); -// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns)); - word2Syns.put(word, syns); - } - } - - return word2Syns; - } - - private HashMap optimize(HashMap word2Syns, HashMap internedWords) { - if (DEBUG) { - System.err.println("before gc"); - for (int i=0; i < 10; i++) System.gc(); - System.err.println("after gc"); - } - - // collect entries - int len = 0; - int size = word2Syns.size(); - String[][] allSynonyms = new String[size][]; - String[] words = new String[size]; - Iterator> iter = word2Syns.entrySet().iterator(); - for (int j=0; j < size; j++) { - Map.Entry entry = iter.next(); - allSynonyms[j] = entry.getValue(); - words[j] = entry.getKey(); - len += words[j].length(); - } - - // assemble large string containing all words - StringBuilder buf = new StringBuilder(len); - for (int j=0; j < size; j++) buf.append(words[j]); - String allWords = new String(buf.toString()); // ensure compact string across JDK versions - buf = null; - - // intern words at app level via memory-overlaid substrings - for (int p=0, j=0; j < size; j++) { - String word = words[j]; - internedWords.put(word, allWords.substring(p, p + word.length())); - p += word.length(); - } - - // replace words with interned words - for (int j=0; j < size; j++) { - String[] syns = allSynonyms[j]; - for (int k=syns.length; --k >= 0; ) { - syns[k] = internedWords.get(syns[k]); - } - word2Syns.remove(words[j]); - word2Syns.put(internedWords.get(words[j]), syns); - } - - if (DEBUG) { - words = null; - allSynonyms = null; - internedWords = null; - allWords = null; - System.err.println("before gc"); - for (int i=0; i < 10; i++) System.gc(); - System.err.println("after gc"); - } - return word2Syns; - } - - // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux - private static byte[] toByteArray(InputStream input) throws IOException { - try { - // safe and fast even if input.available() behaves weird or buggy - int len = Math.max(256, input.available()); - byte[] buffer = new byte[len]; - byte[] output = new byte[len]; - - len = 0; - int n; - while ((n = input.read(buffer)) >= 0) { - if (len + n > output.length) { // grow capacity - byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; - System.arraycopy(output, 0, tmp, 0, len); - System.arraycopy(buffer, 0, tmp, len, n); - buffer = output; // use larger buffer for future larger bulk reads - output = tmp; - } else { - System.arraycopy(buffer, 0, output, len, n); - } - len += n; - } - - if (len == output.length) return output; - buffer = null; // help gc - buffer = new byte[len]; 
- System.arraycopy(output, 0, buffer, 0, len); - return buffer; - } finally { - input.close(); - } - } - -} diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymTokenFilter.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymTokenFilter.java deleted file mode 100644 index e4b45a0c691..00000000000 --- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynonymTokenFilter.java +++ /dev/null @@ -1,148 +0,0 @@ -package org.apache.lucene.wordnet; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.AttributeSource; - -/** - * Injects additional tokens for synonyms of token terms fetched from the - * underlying child stream; the child stream must deliver lowercase tokens - * for synonyms to be found. - * - */ -public class SynonymTokenFilter extends TokenFilter { - - /** The Token.type used to indicate a synonym to higher level filters. */ - public static final String SYNONYM_TOKEN_TYPE = "SYNONYM"; - - private final SynonymMap synonyms; - private final int maxSynonyms; - - private String[] stack = null; - private int index = 0; - private AttributeSource.State current = null; - private int todo = 0; - - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - - /** - * Creates an instance for the given underlying stream and synonym table. - * - * @param input - * the underlying child token stream - * @param synonyms - * the map used to extract synonyms for terms - * @param maxSynonyms - * the maximum number of synonym tokens to return per underlying - * token word (a value of Integer.MAX_VALUE indicates unlimited) - */ - public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) { - super(input); - if (input == null) - throw new IllegalArgumentException("input must not be null"); - if (synonyms == null) - throw new IllegalArgumentException("synonyms must not be null"); - if (maxSynonyms < 0) - throw new IllegalArgumentException("maxSynonyms must not be negative"); - - this.synonyms = synonyms; - this.maxSynonyms = maxSynonyms; - } - - /** Returns the next token in the stream, or null at EOS. 
*/ - @Override - public final boolean incrementToken() throws IOException { - while (todo > 0 && index < stack.length) { // pop from stack - if (createToken(stack[index++], current)) { - todo--; - return true; - } - } - - if (!input.incrementToken()) return false; // EOS; iterator exhausted - - stack = synonyms.getSynonyms(termAtt.toString()); // push onto stack - if (stack.length > maxSynonyms) randomize(stack); - index = 0; - current = captureState(); - todo = maxSynonyms; - return true; - } - - /** - * Creates and returns a token for the given synonym of the current input - * token; Override for custom (stateless or stateful) behavior, if desired. - * - * @param synonym - * a synonym for the current token's term - * @param current - * the current token from the underlying child stream - * @return a new token, or null to indicate that the given synonym should be - * ignored - */ - protected boolean createToken(String synonym, AttributeSource.State current) { - restoreState(current); - termAtt.setEmpty().append(synonym); - typeAtt.setType(SYNONYM_TOKEN_TYPE); - posIncrAtt.setPositionIncrement(0); - return true; - } - - /** - * Randomize synonyms to later sample a subset. Uses constant random seed - * for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random - * number generator with medium statistical quality (multiplicative - * congruential method), producing integers in the range [Integer.MIN_VALUE, - * Integer.MAX_VALUE]. - */ - private static void randomize(Object[] arr) { - int seed = 1234567; // constant - int randomState = 4*seed + 1; -// Random random = new Random(seed); // unnecessary overhead - int len = arr.length; - for (int i=0; i < len-1; i++) { - randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32) - int r = randomState % (len-i); - if (r < 0) r = -r; // e.g. -9 % 2 == -1 -// int r = random.nextInt(len-i); - - // swap arr[i, i+r] - Object tmp = arr[i]; - arr[i] = arr[i + r]; - arr[i + r] = tmp; - } - } - - @Override - public void reset() throws IOException { - super.reset(); - stack = null; - index = 0; - current = null; - todo = 0; - } -} diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/Syns2Index.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/Syns2Index.java deleted file mode 100644 index 8d3ea0c3d60..00000000000 --- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/Syns2Index.java +++ /dev/null @@ -1,329 +0,0 @@ -package org.apache.lucene.wordnet; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.io.PrintStream; -import java.io.Reader; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.TieredMergePolicy; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.Version; - -/** - * Convert the prolog file wn_s.pl from the WordNet prolog download - * into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}). - * - * This has been tested with WordNet 2.0. - * - * The index has fields named "word" ({@link #F_WORD}) - * and "syn" ({@link #F_SYN}). - *

- * <p>
- * The source word (such as 'big') can be looked up in the
- * "word" field, and if present there will be fields named "syn"
- * for every synonym. What's tricky here is that there can be multiple
- * fields with the same name, in the general case for words that have multiple synonyms.
- * That's not a problem with Lucene; you just use {@link org.apache.lucene.document.Document#getValues}.
- * <p>
- * While the WordNet file distinguishes groups of synonyms with
- * related meanings, we don't do that here.
- * <p>

- * - * This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB. - * - * @see WordNet home page - * @see prologdb man page - * @see sample site that uses it - */ -public class Syns2Index -{ - /** - * - */ - private static final PrintStream o = System.out; - - /** - * - */ - private static final PrintStream err = System.err; - - /** - * - */ - public static final String F_SYN = "syn"; - - /** - * - */ - public static final String F_WORD = "word"; - - /** - * we don't actually analyze any text (only a NOT_ANALYZED field), - * but analyzer can't be null, docinverter wants the offset gap! - */ - private static final Analyzer ana = new Analyzer() { - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return null; - } - }; - - /** - * Takes arg of prolog file name and index directory. - */ - public static void main(String[] args) - throws Throwable - { - // get command line arguments - String prologFilename = null; // name of file "wn_s.pl" - String indexDir = null; - if (args.length == 2) - { - prologFilename = args[0]; - indexDir = args[1]; - } - else - { - usage(); - System.exit(1); - } - - // ensure that the prolog file is readable - if (! (new File(prologFilename)).canRead()) - { - err.println("Error: cannot read Prolog file: " + prologFilename); - System.exit(1); - } - // exit if the target index directory already exists - if ((new File(indexDir)).isDirectory()) - { - err.println("Error: index directory already exists: " + indexDir); - err.println("Please specify a name of a non-existent directory"); - System.exit(1); - } - - o.println("Opening Prolog file " + prologFilename); - final FileInputStream fis = new FileInputStream(prologFilename); - final BufferedReader br = new BufferedReader(new InputStreamReader(fis)); - String line; - - // maps a word to all the "groups" it's in - final Map> word2Nums = new TreeMap>(); - // maps a group to all the words in it - final Map> num2Words = new TreeMap>(); - // number of rejected words - int ndecent = 0; - - // status output - int mod = 1; - int row = 1; - // parse prolog file - o.println( "[1/2] Parsing " + prologFilename); - while ((line = br.readLine()) != null) - { - // occasional progress - if ((++row) % mod == 0) // periodically print out line we read in - { - mod *= 2; - o.println("\t" + row + " " + line + " " + word2Nums.size() - + " " + num2Words.size() + " ndecent=" + ndecent); - } - - // syntax check - if (! line.startsWith("s(")) - { - err.println("OUCH: " + line); - System.exit(1); - } - - // parse line - line = line.substring(2); - int comma = line.indexOf(','); - String num = line.substring(0, comma); - int q1 = line.indexOf('\''); - line = line.substring(q1 + 1); - int q2 = line.lastIndexOf('\''); - String word = line.substring(0, q2).toLowerCase().replace("''", "'"); - - // make sure is a normal word - if (! 
isDecent(word)) - { - ndecent++; - continue; // don't store words w/ spaces - } - - // 1/2: word2Nums map - // append to entry or add new one - List lis = word2Nums.get(word); - if (lis == null) - { - lis = new LinkedList(); - lis.add(num); - word2Nums.put(word, lis); - } - else - lis.add(num); - - // 2/2: num2Words map - lis = num2Words.get(num); - if (lis == null) - { - lis = new LinkedList(); - lis.add(word); - num2Words.put(num, lis); - } - else - lis.add(word); - } - - // close the streams - fis.close(); - br.close(); - - // create the index - o.println( "[2/2] Building index to store synonyms, " + - " map sizes are " + word2Nums.size() + " and " + num2Words.size()); - index(indexDir, word2Nums, num2Words); - } - - /** - * Checks to see if a word contains only alphabetic characters by - * checking it one character at a time. - * - * @param s string to check - * @return true if the string is decent - */ - private static boolean isDecent(String s) - { - int len = s.length(); - for (int i = 0; i < len; i++) - { - if (!Character.isLetter(s.charAt(i))) - { - return false; - } - } - return true; - } - - /** - * Forms a Lucene index based on the 2 maps. - * - * @param indexDir the directory where the index should be created - * @param word2Nums - * @param num2Words - */ - private static void index(String indexDir, Map> word2Nums, Map> num2Words) - throws Throwable - { - int row = 0; - int mod = 1; - FSDirectory dir = FSDirectory.open(new File(indexDir)); - try { - - // override the specific index if it already exists - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( - Version.LUCENE_CURRENT, ana).setOpenMode(OpenMode.CREATE)); - ((TieredMergePolicy) writer.getConfig().getMergePolicy()).setUseCompoundFile(true); // why? - Iterator i1 = word2Nums.keySet().iterator(); - while (i1.hasNext()) // for each word - { - String g = i1.next(); - Document doc = new Document(); - - int n = index(word2Nums, num2Words, g, doc); - if (n > 0) - { - doc.add( new Field( F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED)); - if ((++row % mod) == 0) - { - o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc); - mod *= 2; - } - writer.addDocument(doc); - } // else degenerate - } - o.println( "Optimizing.."); - writer.optimize(); - writer.close(); - } finally { - dir.close(); - } - } - - /** - * Given the 2 maps fills a document for 1 word. 
- */ - private static int index(Map> word2Nums, Map> num2Words, String g, Document doc) - throws Throwable - { - List keys = word2Nums.get(g); // get list of key#'s - Iterator i2 = keys.iterator(); - - Set already = new TreeSet(); // keep them sorted - - // pass 1: fill up 'already' with all words - while (i2.hasNext()) // for each key# - { - already.addAll(num2Words.get(i2.next())); // get list of words - } - int num = 0; - already.remove(g); // of course a word is it's own syn - Iterator it = already.iterator(); - while (it.hasNext()) - { - String cur = it.next(); - // don't store things like 'pit bull' -> 'american pit bull' - if (!isDecent(cur)) - { - continue; - } - num++; - doc.add( new Field( F_SYN, cur, Field.Store.YES, Field.Index.NO)); - } - return num; - } - - /** - * - */ - private static void usage() - { - o.println("\n\n" + - "java org.apache.lucene.wordnet.Syns2Index \n\n"); - } - -} diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/package.html b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/package.html deleted file mode 100755 index 19c5b579ba4..00000000000 --- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/package.html +++ /dev/null @@ -1,57 +0,0 @@ - - - - -WordNet Lucene Synonyms Integration - - - - This package uses synonyms defined by WordNet. - There are two methods: query expansion and analysis. - - Both methods first require you to download the WordNet prolog database - Inside this archive is a file named wn_s.pl, which contains the WordNet synonyms. - -

- Query Expansion Method
-
- This method creates a Lucene index that stores the synonyms, which in turn can be used for query expansion.
- You normally run {@link org.apache.lucene.wordnet.Syns2Index} once to build the synonym index/"database", and then call
- {@link org.apache.lucene.wordnet.SynExpand#expand SynExpand.expand(...)} to expand a query.

Instructions

-
-   1. Invoke Syns2Index to build a synonym index. It takes two arguments: the path to wn_s.pl from the WordNet download, and the index directory name.
-   2. Update your UI so that, where appropriate, you call SynExpand.expand(...) to expand user queries with synonyms; see the sketch after this list.
- -
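A minimal sketch of the two steps above, assuming WhitespaceAnalyzer at its Lucene 3.x core location and an existing wn_s.pl; the class name, paths, and boost value are illustrative, not part of the original docs:

    import java.io.File;

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    import org.apache.lucene.wordnet.SynExpand;
    import org.apache.lucene.wordnet.Syns2Index;

    public class SynExpandExample {
      public static void main(String[] args) throws Throwable {
        // Step 1 (run once): build the synonym index from the prolog file.
        // Note: Syns2Index exits if the target index directory already exists.
        Syns2Index.main(new String[] { "wn_s.pl", "synindex" });

        // Step 2 (at query time): open the synonym index and expand a user query.
        IndexSearcher syns = new IndexSearcher(FSDirectory.open(new File("synindex")), true);
        Query expanded = SynExpand.expand("big dog", syns,
            new WhitespaceAnalyzer(Version.LUCENE_CURRENT),
            null,   // null selects the default field, "contents"
            0.9f);  // boost applied to injected synonyms; <= 0 leaves them at 1.0
        System.out.println(expanded);
        syns.close();
      }
    }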

- Analysis Method
-
- This method injects additional synonym tokens for the tokens produced by a child {@link org.apache.lucene.analysis.TokenStream}.

Instructions

-
-   1. Create a {@link org.apache.lucene.wordnet.SynonymMap}, passing in the path to wn_s.pl.
-   2. Add a {@link org.apache.lucene.wordnet.SynonymTokenFilter} to your analyzer. Note: SynonymTokenFilter should come after LowerCaseFilter,
-      because it expects terms to already be in lowercase; a minimal wiring sketch follows below.
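A minimal wiring sketch for the analysis method, assuming WhitespaceTokenizer and LowerCaseFilter at their Lucene 3.x core locations; the analyzer class name is illustrative:

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;
    import org.apache.lucene.wordnet.SynonymMap;
    import org.apache.lucene.wordnet.SynonymTokenFilter;

    class WordNetSynonymAnalyzer extends Analyzer {
      private final SynonymMap map;

      WordNetSynonymAnalyzer(SynonymMap map) {
        this.map = map;  // step 1: load once, e.g. new SynonymMap(new FileInputStream("wn_s.pl"))
      }

      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
        ts = new LowerCaseFilter(Version.LUCENE_CURRENT, ts);       // lowercase first, per the note above
        return new SynonymTokenFilter(ts, map, Integer.MAX_VALUE);  // then inject synonyms
      }
    }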
- - - \ No newline at end of file diff --git a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java b/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java deleted file mode 100644 index 6959a3ed0a8..00000000000 --- a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java +++ /dev/null @@ -1,119 +0,0 @@ -package org.apache.lucene.wordnet; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.Reader; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; - -public class TestSynonymTokenFilter extends BaseTokenStreamTestCase { - final String testFile = "testSynonyms.txt"; - - public void testSynonyms() throws Exception { - SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile)); - /* all expansions */ - Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE); - assertAnalyzesTo(analyzer, "Lost in the woods", - new String[] { "lost", "in", "the", "woods", "forest", "wood" }, - new int[] { 0, 5, 8, 12, 12, 12 }, - new int[] { 4, 7, 11, 17, 17, 17 }, - new int[] { 1, 1, 1, 1, 0, 0 }); - } - - public void testSynonymsSingleQuote() throws Exception { - SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile)); - /* all expansions */ - Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE); - assertAnalyzesTo(analyzer, "king", - new String[] { "king", "baron" }); - } - - public void testSynonymsLimitedAmount() throws Exception { - SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile)); - /* limit to one synonym expansion */ - Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1); - assertAnalyzesTo(analyzer, "Lost in the woods", - /* wood comes before forest due to - * the input file, not lexicographic order - */ - new String[] { "lost", "in", "the", "woods", "wood" }, - new int[] { 0, 5, 8, 12, 12 }, - new int[] { 4, 7, 11, 17, 17 }, - new int[] { 1, 1, 1, 1, 0 }); - } - - public void testReusableTokenStream() throws Exception { - SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile)); - /* limit to one synonym expansion */ - Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1); - assertAnalyzesToReuse(analyzer, "Lost in the woods", - new String[] { "lost", "in", "the", "woods", "wood" }, - new int[] { 0, 5, 8, 12, 12 }, - new int[] { 4, 7, 11, 17, 17 }, - new int[] { 1, 1, 1, 1, 0 }); - assertAnalyzesToReuse(analyzer, "My wolfish dog went to the forest", - new String[] { "my", "wolfish", "ravenous", 
"dog", "went", "to", - "the", "forest", "woods" }, - new int[] { 0, 3, 3, 11, 15, 20, 23, 27, 27 }, - new int[] { 2, 10, 10, 14, 19, 22, 26, 33, 33 }, - new int[] { 1, 1, 0, 1, 1, 1, 1, 1, 0 }); - } - - private class SynonymWhitespaceAnalyzer extends Analyzer { - private SynonymMap synonyms; - private int maxSynonyms; - - public SynonymWhitespaceAnalyzer(SynonymMap synonyms, int maxSynonyms) { - this.synonyms = synonyms; - this.maxSynonyms = maxSynonyms; - } - - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream ts = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); - ts = new SynonymTokenFilter(ts, synonyms, maxSynonyms); - return ts; - } - - private class SavedStreams { - Tokenizer source; - TokenStream result; - } - - @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) - throws IOException { - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); - streams.result = new SynonymTokenFilter(streams.source, synonyms, maxSynonyms); - setPreviousTokenStream(streams); - } else { - streams.source.reset(reader); - } - return streams.result; - } - } - -} diff --git a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestWordnet.java b/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestWordnet.java deleted file mode 100644 index ccd855931a5..00000000000 --- a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestWordnet.java +++ /dev/null @@ -1,94 +0,0 @@ -package org.apache.lucene.wordnet; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.File; -import java.io.IOException; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util._TestUtil; - -public class TestWordnet extends LuceneTestCase { - private IndexSearcher searcher; - private Directory dir; - - String storePathName = new File(TEMP_DIR,"testLuceneWordnet").getAbsolutePath(); - - @Override - public void setUp() throws Exception { - super.setUp(); - // create a temporary synonym index - File testFile = getDataFile("testSynonyms.txt"); - String commandLineArgs[] = { testFile.getAbsolutePath(), storePathName }; - _TestUtil.rmDir(new File(storePathName)); - - try { - Syns2Index.main(commandLineArgs); - } catch (Throwable t) { throw new RuntimeException(t); } - - dir = newFSDirectory(new File(storePathName)); - searcher = new IndexSearcher(dir, true); - } - - public void testExpansion() throws IOException { - assertExpandsTo("woods", new String[] { "woods", "forest", "wood" }); - } - - public void testExpansionSingleQuote() throws IOException { - assertExpandsTo("king", new String[] { "king", "baron" }); - } - - private void assertExpandsTo(String term, String expected[]) throws IOException { - Query expandedQuery = SynExpand.expand(term, searcher, new - MockAnalyzer(random), "field", 1F); - BooleanQuery expectedQuery = new BooleanQuery(); - for (String t : expected) - expectedQuery.add(new TermQuery(new Term("field", t)), - BooleanClause.Occur.SHOULD); - assertEquals(expectedQuery, expandedQuery); - } - - @Override - public void tearDown() throws Exception { - if (searcher != null) { - searcher.close(); - } - if (dir != null) { - dir.close(); - } - rmDir(storePathName); // delete our temporary synonym index - super.tearDown(); - } - - private void rmDir(String directory) { - File dir = new File(directory); - File[] files = dir.listFiles(); - for (int i = 0; i < files.length; i++) { - files[i].delete(); - } - dir.delete(); - } -} diff --git a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/testSynonyms.txt b/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/testSynonyms.txt deleted file mode 100644 index 822bc96858c..00000000000 --- a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/testSynonyms.txt +++ /dev/null @@ -1,9 +0,0 @@ -s(100000001,1,'woods',n,1,0). -s(100000001,2,'wood',n,1,0). -s(100000001,3,'forest',n,1,0). -s(100000002,1,'wolfish',n,1,0). -s(100000002,2,'ravenous',n,1,0). -s(100000003,1,'king',n,1,1). -s(100000003,2,'baron',n,1,1). -s(100000004,1,'king''sevil',n,1,1). -s(100000004,2,'meany',n,1,1). 
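For reference, each s(...) line in the deleted test file above is a WordNet prolog synset entry. The deleted Syns2Index extracts the group id and word from such a line essentially as in this sketch (class and method names are illustrative; doubled single quotes escape a literal quote, as in 'king''sevil' above):

    class SynsetLineParser {
      // Mirrors the parsing in the deleted Syns2Index.main: strip "s(", read the
      // group id up to the first comma, then take the text between the first and
      // last single quote, lowercasing and un-escaping doubled quotes.
      static String[] parse(String line) {
        line = line.substring(2);                 // drop the leading "s("
        int comma = line.indexOf(',');
        String group = line.substring(0, comma);  // synset (group) id, e.g. "100000003"
        int q1 = line.indexOf('\'');              // opening quote of the word
        int q2 = line.lastIndexOf('\'');          // closing quote of the word
        String word = line.substring(q1 + 1, q2).toLowerCase().replace("''", "'");
        return new String[] { group, word };      // e.g. { "100000004", "king'sevil" }
      }
    }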
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java index 0c6f51493c8..78f39da6523 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java @@ -95,9 +95,6 @@ public class MemoryCodec extends Codec { this.out = out; this.field = field; builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - - // The byte[] output we create can easily be > 255 bytes: - builder.setAllowArrayArcs(false); } private class PostingsWriter extends PostingsConsumer { diff --git a/lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java b/lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java new file mode 100644 index 00000000000..0c0a92145d9 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/store/ByteArrayDataOutput.java @@ -0,0 +1,52 @@ +package org.apache.lucene.store; + +import org.apache.lucene.util.BytesRef; + +/** + * @lucene.experimental + */ +public class ByteArrayDataOutput extends DataOutput { + private byte[] bytes; + + private int pos; + private int limit; + + public ByteArrayDataOutput(byte[] bytes) { + reset(bytes); + } + + public ByteArrayDataOutput(byte[] bytes, int offset, int len) { + reset(bytes, offset, len); + } + + public ByteArrayDataOutput() { + reset(BytesRef.EMPTY_BYTES); + } + + public void reset(byte[] bytes) { + reset(bytes, 0, bytes.length); + } + + public void reset(byte[] bytes, int offset, int len) { + this.bytes = bytes; + pos = offset; + limit = offset + len; + } + + public int getPosition() { + return pos; + } + + @Override + public void writeByte(byte b) { + assert pos < limit; + bytes[pos++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int length) { + assert pos + length <= limit; + System.arraycopy(b, offset, bytes, pos, length); + pos += length; + } +} diff --git a/lucene/src/java/org/apache/lucene/util/CharsRef.java b/lucene/src/java/org/apache/lucene/util/CharsRef.java index 2d87a0dabfd..088d9faaa90 100644 --- a/lucene/src/java/org/apache/lucene/util/CharsRef.java +++ b/lucene/src/java/org/apache/lucene/util/CharsRef.java @@ -1,5 +1,7 @@ package org.apache.lucene.util; +import java.util.Comparator; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -167,7 +169,11 @@ public final class CharsRef implements Comparable, CharSequence { * the {@link CharsRef} to copy */ public void copy(CharsRef other) { - chars = ArrayUtil.grow(chars, other.length); + if (chars == null) { + chars = new char[other.length]; + } else { + chars = ArrayUtil.grow(chars, other.length); + } System.arraycopy(other.chars, other.offset, chars, 0, other.length); length = other.length; offset = 0; @@ -213,4 +219,56 @@ public final class CharsRef implements Comparable, CharSequence { public CharSequence subSequence(int start, int end) { return new CharsRef(chars, offset + start, offset + end - 1); } + + private final static Comparator utf16SortedAsUTF8SortOrder = new UTF16SortedAsUTF8Comparator(); + + public static Comparator getUTF16SortedAsUTF8Comparator() { + return utf16SortedAsUTF8SortOrder; + } + + private static class UTF16SortedAsUTF8Comparator implements Comparator { + // Only singleton + private UTF16SortedAsUTF8Comparator() {}; + + public int compare(CharsRef a, CharsRef b) { + if (a == b) + return 0; + + final char[] aChars = a.chars; + int aUpto = a.offset; + final char[] bChars = b.chars; + int bUpto = b.offset; + + final int aStop = aUpto + Math.min(a.length, b.length); + + while (aUpto < aStop) { + char aChar = aChars[aUpto++]; + char bChar = bChars[bUpto++]; + if (aChar != bChar) { + // http://icu-project.org/docs/papers/utf16_code_point_order.html + + /* aChar != bChar, fix up each one if they're both in or above the surrogate range, then compare them */ + if (aChar >= 0xd800 && bChar >= 0xd800) { + if (aChar >= 0xe000) { + aChar -= 0x800; + } else { + aChar += 0x2000; + } + + if (bChar >= 0xe000) { + bChar -= 0x800; + } else { + bChar += 0x2000; + } + } + + /* now aChar and bChar are in code point order */ + return (int)aChar - (int)bChar; /* int must be 32 bits wide */ + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + } } \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/util/fst/FST.java b/lucene/src/java/org/apache/lucene/util/fst/FST.java index 7fa3339a256..ccc49cd0f87 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/src/java/org/apache/lucene/util/fst/FST.java @@ -71,7 +71,11 @@ public class FST { // Increment version to change it private final static String FILE_FORMAT_NAME = "FST"; private final static int VERSION_START = 0; - private final static int VERSION_CURRENT = VERSION_START; + + /** Changed numBytesPerArc for array'd case from byte to int. 
*/ + private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1; + + private final static int VERSION_CURRENT = VERSION_INT_NUM_BYTES_PER_ARC; // Never serialized; just used to represent the virtual // final node w/ no arcs: @@ -106,6 +110,8 @@ public class FST { private boolean allowArrayArcs = true; + private Arc cachedRootArcs[]; + public final static class Arc { public int label; public T output; @@ -113,7 +119,7 @@ public class FST { int target; byte flags; - T nextFinalOutput; + public T nextFinalOutput; int nextArc; // This is non-zero if current arcs are fixed array: @@ -176,7 +182,7 @@ public class FST { public FST(DataInput in, Outputs outputs) throws IOException { this.outputs = outputs; writer = null; - CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_START); + CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_INT_NUM_BYTES_PER_ARC, VERSION_INT_NUM_BYTES_PER_ARC); if (in.readByte() == 1) { // accepts empty string int numBytes = in.readVInt(); @@ -209,6 +215,8 @@ public class FST { bytes = new byte[in.readVInt()]; in.readBytes(bytes, 0, bytes.length); NO_OUTPUT = outputs.getNoOutput(); + + cacheRootArcs(); } public INPUT_TYPE getInputType() { @@ -220,7 +228,7 @@ public class FST { return bytes.length; } - void finish(int startNode) { + void finish(int startNode) throws IOException { if (startNode == FINAL_END_NODE && emptyOutput != null) { startNode = 0; } @@ -231,6 +239,32 @@ public class FST { System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite); bytes = finalBytes; this.startNode = startNode; + + cacheRootArcs(); + } + + // Caches first 128 labels + @SuppressWarnings("unchecked") + private void cacheRootArcs() throws IOException { + cachedRootArcs = (FST.Arc[]) new FST.Arc[0x80]; + final FST.Arc arc = new FST.Arc(); + getFirstArc(arc); + final BytesReader in = getBytesReader(0); + if (targetHasArcs(arc)) { + readFirstRealArc(arc.target, arc); + while(true) { + assert arc.label != END_LABEL; + if (arc.label < cachedRootArcs.length) { + cachedRootArcs[arc.label] = new Arc().copyFrom(arc); + } else { + break; + } + if (arc.isLast()) { + break; + } + readNextRealArc(arc, in); + } + } } void setEmptyOutput(T v) throws IOException { @@ -345,8 +379,9 @@ public class FST { writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY); writer.writeVInt(node.numArcs); // placeholder -- we'll come back and write the number - // of bytes per arc here: - writer.writeByte((byte) 0); + // of bytes per arc (int) here: + // TODO: we could make this a vInt instead + writer.writeInt(0); fixedArrayStart = writer.posWrite; //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart); } else { @@ -421,15 +456,21 @@ public class FST { } } + // TODO: if arc'd arrays will be "too wasteful" by some + // measure, eg if arcs have vastly different sized + // outputs, then we should selectively disable array for + // such cases + if (doFixedArray) { assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed // byte size final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc; bytes = ArrayUtil.grow(bytes, sizeNeeded); - if (maxBytesPerArc > 255) { - throw new IllegalStateException("max arc size is too large (" + maxBytesPerArc + "); disable array arcs by calling Builder.setAllowArrayArcs(false)"); - } + // TODO: we could make this a vInt instead + bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); + bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16); + bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8); 
bytes[fixedArrayStart-1] = (byte) maxBytesPerArc; // expand the arcs in place, backwards @@ -502,7 +543,7 @@ public class FST { if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { // array: jump straight to end arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readByte() & 0xFF; + arc.bytesPerArc = in.readInt(); //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); arc.posArcsStart = in.pos; arc.arcIdx = arc.numArcs - 2; @@ -528,7 +569,7 @@ public class FST { } arc.nextArc = in.pos+1; } - readNextRealArc(arc); + readNextRealArc(arc, in); assert arc.isLast(); return arc; } @@ -572,7 +613,7 @@ public class FST { //System.out.println(" fixedArray"); // this is first arc in a fixed-array arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readByte() & 0xFF; + arc.bytesPerArc = in.readInt(); arc.arcIdx = -1; arc.nextArc = arc.posArcsStart = in.pos; //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); @@ -580,7 +621,7 @@ public class FST { arc.nextArc = address; arc.bytesPerArc = 0; } - return readNextRealArc(arc); + return readNextRealArc(arc, in); } /** @@ -609,7 +650,7 @@ public class FST { } return readFirstRealArc(arc.nextArc, arc); } else { - return readNextRealArc(arc); + return readNextRealArc(arc, getBytesReader(0)); } } @@ -627,7 +668,7 @@ public class FST { //System.out.println(" nextArc fake array"); in.pos--; in.readVInt(); - in.readByte(); + in.readInt(); } } else { if (arc.bytesPerArc != 0) { @@ -645,17 +686,16 @@ public class FST { return readLabel(in); } - Arc readNextRealArc(Arc arc) throws IOException { + Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { // this is a continuing arc in a fixed array - final BytesReader in; if (arc.bytesPerArc != 0) { // arcs are at fixed entries arc.arcIdx++; assert arc.arcIdx < arc.numArcs; - in = getBytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc); + in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc; } else { // arcs are packed - in = getBytesReader(arc.nextArc); + in.pos = arc.nextArc; } arc.flags = in.readByte(); arc.label = readLabel(in); @@ -701,7 +741,18 @@ public class FST { /** Finds an arc leaving the incoming arc, replacing the arc in place. * This returns null if the arc was not found, else the incoming arc. 
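 * <p>A minimal lookup sketch (the fst instance and the label are assumed;
 * the Arc-reuse pattern mirrors cacheRootArcs above):
 * <pre>
 *   FST.Arc arc = fst.getFirstArc(new FST.Arc());
 *   if (fst.findTargetArc('a', arc, arc) == null) {
 *     // no arc labeled 'a' leaves the root
 *   }
 * </pre>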
*/ public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc) throws IOException { - + assert cachedRootArcs != null; + // Short-circuit if this arc is in the root arc cache: + if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) { + final Arc result = cachedRootArcs[labelToMatch]; + if (result == null) { + return result; + } else { + arc.copyFrom(result); + return arc; + } + } + if (labelToMatch == END_LABEL) { if (follow.isFinal()) { if (follow.target <= 0) { @@ -726,14 +777,18 @@ public class FST { // reusable stuff eg BytesReader: final BytesReader in = getBytesReader(follow.target); + // System.out.println("fta label=" + (char) labelToMatch); + if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) { // Arcs are full array; do binary search: arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readByte() & 0xFF; + //System.out.println(" bs " + arc.numArcs); + arc.bytesPerArc = in.readInt(); arc.posArcsStart = in.pos; int low = 0; int high = arc.numArcs-1; while (low <= high) { + //System.out.println(" cycle"); int mid = (low + high) >>> 1; in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; int midLabel = readLabel(in); @@ -744,7 +799,8 @@ public class FST { high = mid - 1; else { arc.arcIdx = mid-1; - return readNextRealArc(arc); + //System.out.println(" found!"); + return readNextRealArc(arc, in); } } @@ -754,7 +810,12 @@ public class FST { // Linear scan readFirstTargetArc(follow, arc); while(true) { + //System.out.println(" non-bs cycle"); + // TODO: we should fix this code to not have to create + // object for the output of every arc we scan... only + // for the matching arc, if found if (arc.label == labelToMatch) { + //System.out.println(" found!"); return arc; } else if (arc.label > labelToMatch) { return null; @@ -863,7 +924,7 @@ public class FST { } // Non-static: reads byte[] from FST - class BytesReader extends DataInput { + final class BytesReader extends DataInput { int pos; public BytesReader(int pos) { diff --git a/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java b/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java index 927c1c118ec..a6c4b66c9c1 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java +++ b/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java @@ -170,7 +170,7 @@ abstract class FSTEnum { if (found) { // Match arc.arcIdx = mid-1; - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.arcIdx == mid; assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid; output[upto] = fst.outputs.add(output[upto-1], arc.output); @@ -185,7 +185,7 @@ abstract class FSTEnum { } else if (low == arc.numArcs) { // Dead end arc.arcIdx = arc.numArcs-2; - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.isLast(); // Dead end (target is after the last arc); // rollback to last fork then push @@ -205,7 +205,7 @@ abstract class FSTEnum { } } else { arc.arcIdx = (low > high ? low : high)-1; - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.label > targetLabel; pushFirst(); return; @@ -309,7 +309,7 @@ abstract class FSTEnum { // Match -- recurse //System.out.println(" match! 
arcIdx=" + mid); arc.arcIdx = mid-1; - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.arcIdx == mid; assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid; output[upto] = fst.outputs.add(output[upto-1], arc.output); @@ -352,7 +352,7 @@ abstract class FSTEnum { // There is a floor arc: arc.arcIdx = (low > high ? high : low)-1; //System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1)); - fst.readNextRealArc(arc); + fst.readNextRealArc(arc, in); assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel; assert arc.label < targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel; pushLast(); diff --git a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java index a10376e7472..276aa997214 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -35,6 +35,7 @@ final class NodeHash { } private boolean nodesEqual(Builder.UnCompiledNode node, int address) throws IOException { + final FST.BytesReader in = fst.getBytesReader(0); fst.readFirstRealArc(address, scratchArc); if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { return false; @@ -56,7 +57,7 @@ final class NodeHash { return false; } } - fst.readNextRealArc(scratchArc); + fst.readNextRealArc(scratchArc, in); } return false; @@ -87,6 +88,7 @@ final class NodeHash { // hash code for a frozen node private int hash(int node) throws IOException { final int PRIME = 31; + final FST.BytesReader in = fst.getBytesReader(0); //System.out.println("hash frozen"); int h = 0; fst.readFirstRealArc(node, scratchArc); @@ -102,7 +104,7 @@ final class NodeHash { if (scratchArc.isLast()) { break; } - fst.readNextRealArc(scratchArc); + fst.readNextRealArc(scratchArc, in); } //System.out.println(" ret " + (h&Integer.MAX_VALUE)); return h & Integer.MAX_VALUE; diff --git a/lucene/src/site/build/site/contributions.html b/lucene/src/site/build/site/contributions.html index 72f4e9ca82c..1e2ecc0a4dd 100644 --- a/lucene/src/site/build/site/contributions.html +++ b/lucene/src/site/build/site/contributions.html @@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker - diff --git a/lucene/src/site/build/site/demo.html b/lucene/src/site/build/site/demo.html index 24251b5f2a7..90373dfe61e 100644 --- a/lucene/src/site/build/site/demo.html +++ b/lucene/src/site/build/site/demo.html @@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker - diff --git a/lucene/src/site/build/site/demo2.html b/lucene/src/site/build/site/demo2.html index b369c658972..0916963ef3c 100644 --- a/lucene/src/site/build/site/demo2.html +++ b/lucene/src/site/build/site/demo2.html @@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker - diff --git a/lucene/src/site/build/site/fileformats.html b/lucene/src/site/build/site/fileformats.html index ef91a18e36f..da02cf70a98 100644 --- a/lucene/src/site/build/site/fileformats.html +++ b/lucene/src/site/build/site/fileformats.html @@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker - diff --git a/lucene/src/site/build/site/gettingstarted.html b/lucene/src/site/build/site/gettingstarted.html index c83de12ffa0..a50a3581ede 100644 --- a/lucene/src/site/build/site/gettingstarted.html +++ b/lucene/src/site/build/site/gettingstarted.html @@ -168,9 +168,6 @@ 
document.write("Last Published: " + document.lastModified); Spellchecker - diff --git a/lucene/src/site/build/site/index.html b/lucene/src/site/build/site/index.html index 75ca1fcc910..bd258d4fd1d 100644 --- a/lucene/src/site/build/site/index.html +++ b/lucene/src/site/build/site/index.html @@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker - diff --git a/lucene/src/site/build/site/linkmap.html b/lucene/src/site/build/site/linkmap.html index cb546159dbf..c4b46090f85 100644 --- a/lucene/src/site/build/site/linkmap.html +++ b/lucene/src/site/build/site/linkmap.html @@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker - @@ -358,12 +355,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker  ___________________  javadoc-contrib-spellchecker - -
-   • Wordnet  ___________________  javadoc-contrib-wordnet
  • diff --git a/lucene/src/site/build/site/lucene-contrib/index.html b/lucene/src/site/build/site/lucene-contrib/index.html index 3d34f87c119..6f511ac85fb 100644 --- a/lucene/src/site/build/site/lucene-contrib/index.html +++ b/lucene/src/site/build/site/lucene-contrib/index.html @@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker - @@ -263,9 +260,6 @@ document.write("Last Published: " + document.lastModified); spellchecker
-   • wordnet
  • xml-query-parser
@@ -375,12 +369,7 @@ document.write("Last Published: " + document.lastModified);

Provides tools for spellchecking and suggestions with Lucene.

See spellchecker javadoc

- wordnet
- Tools to help utilize wordnet synonyms with Lucene
- See wordnet javadoc
+

xml-query-parser

A QueryParser that can read queries written in an XML format.

See xml-query-parser javadoc diff --git a/lucene/src/site/build/site/queryparsersyntax.html b/lucene/src/site/build/site/queryparsersyntax.html index f2c9d6929e6..ba748aa2262 100644 --- a/lucene/src/site/build/site/queryparsersyntax.html +++ b/lucene/src/site/build/site/queryparsersyntax.html @@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker

- diff --git a/lucene/src/site/build/site/scoring.html b/lucene/src/site/build/site/scoring.html index daba6794660..4fe632a7665 100644 --- a/lucene/src/site/build/site/scoring.html +++ b/lucene/src/site/build/site/scoring.html @@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker - diff --git a/lucene/src/site/build/site/systemrequirements.html b/lucene/src/site/build/site/systemrequirements.html index 668f89d3503..94e98b7a3fd 100644 --- a/lucene/src/site/build/site/systemrequirements.html +++ b/lucene/src/site/build/site/systemrequirements.html @@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified); Spellchecker - diff --git a/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml b/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml index 8d156ccee7e..749364202c1 100644 --- a/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml +++ b/lucene/src/site/src/documentation/content/xdocs/lucene-contrib/index.xml @@ -106,11 +106,6 @@

See spellchecker javadoc

- wordnet
- Tools to help utilize wordnet synonyms with Lucene
- See wordnet javadoc
-
xml-query-parser

A QueryParser that can read queries written in an XML format.

See xml-query-parser javadoc

diff --git a/lucene/src/site/src/documentation/content/xdocs/site.xml b/lucene/src/site/src/documentation/content/xdocs/site.xml index bf4850eee2d..224e4f347fa 100755 --- a/lucene/src/site/src/documentation/content/xdocs/site.xml +++ b/lucene/src/site/src/documentation/content/xdocs/site.xml @@ -66,7 +66,6 @@ See http://forrest.apache.org/docs/linking.html for more info - @@ -106,7 +105,6 @@ See http://forrest.apache.org/docs/linking.html for more info - diff --git a/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index b5abcc18551..db82596e4ac 100644 --- a/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -260,7 +260,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { default: text = _TestUtil.randomUnicodeString(random, maxWordLength); } - + + if (VERBOSE) { + System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text); + } + TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text)); assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class)); CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); @@ -286,6 +290,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { ts.close(); // verify reusing is "reproducable" and also get the normal tokenstream sanity checks if (!tokens.isEmpty()) { + if (VERBOSE) { + System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis"); + } if (typeAtt != null && posIncAtt != null && offsetAtt != null) { // offset + pos + type assertAnalyzesToReuse(a, text, diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterCommit.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterCommit.java index 563d1c13d24..e6244e915a5 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterCommit.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterCommit.java @@ -31,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; @@ -166,6 +167,13 @@ public class TestIndexWriterCommit extends LuceneTestCase { * measure max temp disk space used. 
*/ public void testCommitOnCloseDiskUsage() throws IOException { + // MemoryCodec, since it uses FST, is not necessarily + // "additive", ie if you add up N small FSTs, then merge + // them, the merged result can easily be larger than the + // sum because the merged FST may use array encoding for + // some arcs (which uses more space): + assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory")); + assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory")); MockDirectoryWrapper dir = newDirectory(); Analyzer analyzer; if (random.nextBoolean()) { diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java index 50febbd5906..5c8f0d58e0a 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterOnDiskFull.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; @@ -142,6 +143,14 @@ public class TestIndexWriterOnDiskFull extends LuceneTestCase { */ public void testAddIndexOnDiskFull() throws IOException { + // MemoryCodec, since it uses FST, is not necessarily + // "additive", ie if you add up N small FSTs, then merge + // them, the merged result can easily be larger than the + // sum because the merged FST may use array encoding for + // some arcs (which uses more space): + assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory")); + assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory")); + int START_COUNT = 57; int NUM_DIR = TEST_NIGHTLY ? 50 : 5; int END_COUNT = START_COUNT + NUM_DIR* (TEST_NIGHTLY ? 25 : 5); diff --git a/lucene/src/test/org/apache/lucene/util/TestCharsRef.java b/lucene/src/test/org/apache/lucene/util/TestCharsRef.java new file mode 100644 index 00000000000..1852028378d --- /dev/null +++ b/lucene/src/test/org/apache/lucene/util/TestCharsRef.java @@ -0,0 +1,41 @@ +package org.apache.lucene.util; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class TestCharsRef extends LuceneTestCase { + public void testUTF16InUTF8Order() { + final int numStrings = atLeast(1000); + BytesRef utf8[] = new BytesRef[numStrings]; + CharsRef utf16[] = new CharsRef[numStrings]; + + for (int i = 0; i < numStrings; i++) { + String s = _TestUtil.randomUnicodeString(random); + utf8[i] = new BytesRef(s); + utf16[i] = new CharsRef(s); + } + + Arrays.sort(utf8); + Arrays.sort(utf16, CharsRef.getUTF16SortedAsUTF8Comparator()); + + for (int i = 0; i < numStrings; i++) { + assertEquals(utf8[i].utf8ToString(), utf16[i].toString()); + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java new file mode 100644 index 00000000000..7750114e83a --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java @@ -0,0 +1,179 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.LineNumberReader; +import java.io.Reader; +import java.text.ParseException; +import java.util.ArrayList; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.util.CharsRef; + +/** + * Parser for the Solr synonyms format. + *
    + *
  1. Blank lines and lines starting with '#' are comments. + *
  2. Explicit mappings match any token sequence on the LHS of "=>" + * and replace with all alternatives on the RHS. These types of mappings + * ignore the expand parameter in the constructor. + * Example: + *
    i-pod, i pod => ipod
    + *
  3. Equivalent synonyms may be separated with commas and give + * no explicit mapping. In this case the mapping behavior will + * be taken from the expand parameter in the constructor. This allows + * the same synonym file to be used in different synonym handling strategies. + * Example: + *
    ipod, i-pod, i pod
    + * + *
  4. Multiple synonym mapping entries are merged. + * Example: + *
    + * foo => foo bar
    + * foo => baz

    + * is equivalent to

    + * foo => foo bar, baz + *
    + *
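 * Example usage (a sketch; the analyzer and the file contents are
 * illustrative, not part of this class):
 * <pre>
 *   SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
 *   parser.add(new StringReader("ipod, i-pod, i pod\nfoo => foo bar"));
 *   SynonymMap map = parser.build();
 * </pre>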
+ * @lucene.experimental + */ +public class SolrSynonymParser extends SynonymMap.Builder { + private final boolean expand; + private final Analyzer analyzer; + + public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { + super(dedup); + this.expand = expand; + this.analyzer = analyzer; + } + + public void add(Reader in) throws IOException, ParseException { + LineNumberReader br = new LineNumberReader(in); + try { + addInternal(br); + } catch (IllegalArgumentException e) { + ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0); + ex.initCause(e); + throw ex; + } finally { + br.close(); + } + } + + private void addInternal(BufferedReader in) throws IOException { + String line = null; + while ((line = in.readLine()) != null) { + if (line.length() == 0 || line.charAt(0) == '#') { + continue; // ignore empty lines and comments + } + + CharsRef inputs[]; + CharsRef outputs[]; + + // TODO: we could process this more efficiently. + String sides[] = split(line, "=>"); + if (sides.length > 1) { // explicit mapping + if (sides.length != 2) { + throw new IllegalArgumentException("more than one explicit mapping specified on the same line"); + } + String inputStrings[] = split(sides[0], ","); + inputs = new CharsRef[inputStrings.length]; + for (int i = 0; i < inputs.length; i++) { + inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef()); + } + + String outputStrings[] = split(sides[1], ","); + outputs = new CharsRef[outputStrings.length]; + for (int i = 0; i < outputs.length; i++) { + outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef()); + } + } else { + String inputStrings[] = split(line, ","); + inputs = new CharsRef[inputStrings.length]; + for (int i = 0; i < inputs.length; i++) { + inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef()); + } + if (expand) { + outputs = inputs; + } else { + outputs = new CharsRef[1]; + outputs[0] = inputs[0]; + } + } + + // currently we include the term itself in the map, + // and use includeOrig = false always. + // this is how the existing filter does it, but its actually a bug, + // especially if combined with ignoreCase = true + for (int i = 0; i < inputs.length; i++) { + for (int j = 0; j < outputs.length; j++) { + add(inputs[i], outputs[j], false); + } + } + } + } + + private static String[] split(String s, String separator) { + ArrayList list = new ArrayList(2); + StringBuilder sb = new StringBuilder(); + int pos=0, end=s.length(); + while (pos < end) { + if (s.startsWith(separator,pos)) { + if (sb.length() > 0) { + list.add(sb.toString()); + sb=new StringBuilder(); + } + pos+=separator.length(); + continue; + } + + char ch = s.charAt(pos++); + if (ch=='\\') { + sb.append(ch); + if (pos>=end) break; // ERROR, or let it go? 
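+      // Escape-handling sketch: for a line like "a\,a => b\,b" the
+      // backslash and the char after it are copied through verbatim, so
+      // an escaped comma (or "=>") never matches the separator test
+      // above; unescape() strips the backslashes afterwards.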
+ ch = s.charAt(pos++); + } + + sb.append(ch); + } + + if (sb.length() > 0) { + list.add(sb.toString()); + } + + return list.toArray(new String[list.size()]); + } + + private String unescape(String s) { + if (s.indexOf("\\") >= 0) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char ch = s.charAt(i); + if (ch == '\\' && i < s.length() - 1) { + sb.append(s.charAt(++i)); + } else { + sb.append(ch); + } + } + return sb.toString(); + } + return s; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java index 633156e3101..64827c821c4 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java @@ -1,3 +1,5 @@ +package org.apache.lucene.analysis.synonym; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -15,245 +17,550 @@ * limitations under the License. */ -package org.apache.lucene.analysis.synonym; +import java.io.IOException; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.fst.FST; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedList; +/** + * Matches single or multi word synonyms in a token stream. + * This token stream cannot properly handle position + * increments != 1, ie, you should place this filter before + * filtering out stop words. + * + *

Note that with the current implementation, parsing is + * greedy, so whenever multiple parses would apply, the rule + * starting the earliest and parsing the most tokens wins. + * For example if you have these rules: + * + *

+ *   a -> x
+ *   a b -> y
+ *   b c d -> z
+ * 
Then input a b c d e parses to y b c + * d, i.e., the 2nd rule "wins" because it started + * earliest and matched more input tokens than the other rules + * starting at that point.
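 * (In the Solr synonyms-file format parsed by SolrSynonymParser, an
 * equivalent rule set could be written, roughly, as:
 * <pre>
 *   a => x
 *   a b => y
 *   b c d => z
 * </pre>
 * assuming default analysis of each side.)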

+ * + *

A future improvement to this filter could allow + * non-greedy parsing, such that the 3rd rule would win, and + * also separately allow multiple parses, such that all 3 + * rules would match, perhaps even on a rule-by-rule + * basis.

+ * + *

NOTE: when a match occurs, the output tokens + * associated with the matching rule are "stacked" on top of + * the input stream (if the rule had + * keepOrig=true) and also on top of another + * matched rule's output tokens. This is not a correct + * solution, as really the output should be an arbitrary + * graph/lattice. For example, with the above match, you + * would expect an exact PhraseQuery "y b + * c" to match the parsed tokens, but it will fail to + * do so. This limitation is necessary because Lucene's + * TokenStream (and index) cannot yet represent an arbitrary + * graph.

+ * + *

NOTE: If multiple incoming tokens arrive at the + * same position, only the first token at that position is + * used for parsing. Subsequent tokens simply pass through + * and are not parsed. A future improvement would be to + * allow these tokens to also be matched.
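 * <p>Construction sketch, using only APIs added in this patch (the
 * upstream tokenizer is an assumed TokenStream):
 * <pre>
 *   SynonymMap.Builder builder = new SynonymMap.Builder(true);
 *   builder.add(new CharsRef("dns"),
 *               new CharsRef("domain\u0000name\u0000service"), true);
 *   TokenStream syns = new SynonymFilter(tokenizer, builder.build(), true);
 * </pre>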

+ */ + +// TODO: maybe we should resolve token -> wordID then run +// FST on wordIDs, for better perf? + +// TODO: a more efficient approach would be Aho/Corasick's +// algorithm +// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm +// It improves over the current approach here +// because it does not fully re-start matching at every +// token. For example, if one pattern is "a b c x" +// and another is "b c d" and the input is "a b c d", on +// trying to parse "a b c x" but failing when you got to x, +// rather than starting over again you really should +// immediately recognize that "b c d" matches at the next +// input. I suspect this won't matter that much in +// practice, but it's possible on some set of synonyms it +// will. We'd have to modify Aho/Corasick to enforce our +// conflict resolving (eg greedy matching) because that algo +// finds all matches. -/** SynonymFilter handles multi-token synonyms with variable position increment offsets. - *

- * The matched tokens from the input stream may be optionally passed through (includeOrig=true) - * or discarded. If the original tokens are included, the position increments may be modified - * to retain absolute positions after merging with the synonym tokenstream. - *

- * Generated synonyms will start at the same position as the first matched source token. - */ public final class SynonymFilter extends TokenFilter { - private final SynonymMap map; // Map - private Iterator replacement; // iterator over generated tokens + public static final String TYPE_SYNONYM = "SYNONYM"; - public SynonymFilter(TokenStream in, SynonymMap map) { - super(in); - if (map == null) - throw new IllegalArgumentException("map is required"); + private final SynonymMap synonyms; - this.map = map; - // just ensuring these attributes exist... - addAttribute(CharTermAttribute.class); - addAttribute(PositionIncrementAttribute.class); - addAttribute(OffsetAttribute.class); - addAttribute(TypeAttribute.class); + private final boolean ignoreCase; + private final int rollBufferSize; + + private int captureCount; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + // How many future input tokens have already been matched + // to a synonym; because the matching is "greedy" we don't + // try to do any more matching for such tokens: + private int inputSkipCount; + + // Hold all buffered (read ahead) stacked input tokens for + // a future position. When multiple tokens are at the + // same position, we only store (and match against) the + // term for the first token at the position, but capture + // state for (and enumerate) all other tokens at this + // position: + private static class PendingInput { + final CharsRef term = new CharsRef(); + AttributeSource.State state; + boolean keepOrig; + boolean consumed = true; + int startOffset; + int endOffset; + + public void reset() { + state = null; + consumed = true; + keepOrig = false; + } + }; + + // Rolling buffer, holding pending input tokens we had to + // clone because we needed to look ahead, indexed by + // position: + private final PendingInput[] futureInputs; + + // Holds pending output synonyms for one future position: + private static class PendingOutputs { + CharsRef[] outputs; + int upto; + int count; + int posIncr = 1; + + public PendingOutputs() { + outputs = new CharsRef[1]; + } + + public void reset() { + upto = count = 0; + posIncr = 1; + } + + public CharsRef pullNext() { + assert upto < count; + final CharsRef result = outputs[upto++]; + posIncr = 0; + if (upto == count) { + reset(); + } + return result; + } + + public void add(char[] output, int offset, int len) { + if (count == outputs.length) { + final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(outputs, 0, next, 0, count); + outputs = next; + } + if (outputs[count] == null) { + outputs[count] = new CharsRef(); + } + outputs[count].copy(output, offset, len); + count++; + } + }; + + private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + // Rolling buffer, holding stack of pending synonym + // outputs, indexed by position: + private final PendingOutputs[] futureOutputs; + + // Where (in rolling buffers) to write next input saved state: + private int nextWrite; + + // Where (in rolling buffers) to read next input saved state: + private int nextRead; + + // True once we've read last token + private boolean finished; + + private final FST.Arc scratchArc; + + private final FST fst; + + private 
final BytesRef scratchBytes = new BytesRef(); + private final CharsRef scratchChars = new CharsRef(); + + /** + * @param input input tokenstream + * @param synonyms synonym map + * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}. + * Note, if you set this to true, its your responsibility to lowercase + * the input entries when you create the {@link SynonymMap} + */ + public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) { + super(input); + this.synonyms = synonyms; + this.ignoreCase = ignoreCase; + this.fst = synonyms.fst; + + if (fst == null) { + throw new IllegalArgumentException("fst must be non-null"); + } + + // Must be 1+ so that when roll buffer is at full + // lookahead we can distinguish this full buffer from + // the empty buffer: + rollBufferSize = 1+synonyms.maxHorizontalContext; + + futureInputs = new PendingInput[rollBufferSize]; + futureOutputs = new PendingOutputs[rollBufferSize]; + for(int pos=0;pos bar - * - need to backtrack - retry matches for tokens already read - * a b c d => foo - * b c => bar - * If the input stream is "a b c x", one will consume "a b c d" - * trying to match the first rule... all but "a" should be - * pushed back so a match may be made on "b c". - * - don't try and match generated tokens (thus need separate queue) - * matching is not recursive. - * - handle optional generation of original tokens in all these cases, - * merging token streams to preserve token positions. - * - preserve original positionIncrement of first matched token - */ - @Override - public boolean incrementToken() throws IOException { - while (true) { - // if there are any generated tokens, return them... don't try any - // matches against them, as we specifically don't want recursion. - if (replacement!=null && replacement.hasNext()) { - copy(this, replacement.next()); - return true; - } + This is the core of this TokenFilter: it locates the + synonym matches and buffers up the results into + futureInputs/Outputs. - // common case fast-path of first token not matching anything - AttributeSource firstTok = nextTok(); - if (firstTok == null) return false; - CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class); - SynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null; - if (result == null) { - copy(this, firstTok); - return true; - } + NOTE: this calls input.incrementToken and does not + capture the state if no further tokens were checked. So + caller must then forward state to our caller, or capture: + */ - // fast-path failed, clone ourselves if needed - if (firstTok == this) - firstTok = cloneAttributes(); - // OK, we matched a token, so find the longest match. + private void parse() throws IOException { + //System.out.println("\nS: parse"); - matched = new LinkedList(); + assert inputSkipCount == 0; - result = match(result); + int curNextRead = nextRead; - if (result==null) { - // no match, simply return the first token read. - copy(this, firstTok); - return true; - } + // Holds the longest match we've seen so far: + BytesRef matchOutput = null; + int matchInputLength = 0; - // reuse, or create new one each time? - ArrayList generated = new ArrayList(result.synonyms.length + matched.size() + 1); + BytesRef pendingOutput = fst.outputs.getNoOutput(); + fst.getFirstArc(scratchArc); - // - // there was a match... 
let's generate the new tokens, merging - // in the matched tokens (position increments need adjusting) - // - AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast(); - boolean includeOrig = result.includeOrig(); + assert scratchArc.output == fst.outputs.getNoOutput(); - AttributeSource origTok = includeOrig ? firstTok : null; - PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class); - int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream - int repPos=0; // curr position in replacement token stream - int pos=0; // current position in merged token stream + int tokenCount = 0; - for (int i=0; i foo/0 - // should I re-create the gap on the next buffered token? - - replacement = generated.iterator(); - // Now return to the top of the loop to read and return the first - // generated token.. The reason this is done is that we may have generated - // nothing at all, and may need to continue with more matching logic. - } - } - - - // - // Defer creation of the buffer until the first time it is used to - // optimize short fields with no matches. - // - private LinkedList buffer; - private LinkedList matched; - - private boolean exhausted; - - private AttributeSource nextTok() throws IOException { - if (buffer!=null && !buffer.isEmpty()) { - return buffer.removeFirst(); - } else { - if (!exhausted && input.incrementToken()) { - return this; } else { - exhausted = true; - return null; + // Still in our lookahead + buffer = futureInputs[curNextRead].term.chars; + bufferLen = futureInputs[curNextRead].term.length; + //System.out.println(" old token=" + new String(buffer, 0, bufferLen)); + } + + tokenCount++; + + // Run each char in this token through the FST: + int bufUpto = 0; + while(bufUpto < bufferLen) { + final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); + if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) { + //System.out.println(" stop"); + break byToken; + } + + // Accum the output + pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); + //System.out.println(" char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output); + bufUpto += Character.charCount(codePoint); + } + + // OK, entire token matched; now see if this is a final + // state: + if (scratchArc.isFinal()) { + matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput); + matchInputLength = tokenCount; + //System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput); + } + + // See if the FST wants to continue matching (ie, needs to + // see the next input token): + if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) { + // No further rules can match here; we're done + // searching for matching rules starting at the + // current input position. 
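+          // (Worked example, values invented: with the single rule
+          // "a b => y" and input "a b c", token "a" is not final but a
+          // WORD_SEPARATOR arc exists, so the scan continues; token "b"
+          // reaches a final state, recording matchInputLength=2 and
+          // matchOutput=y's encoded bytes; the separator arc then fails
+          // and we break here, leaving "c" to be parsed afresh.)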
+ break; + } else { + // More matching is possible -- accum the output (if + // any) of the WORD_SEP arc: + pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); + if (nextRead == nextWrite) { + capture(); + } + } + + curNextRead = rollIncr(curNextRead); + } + + if (nextRead == nextWrite && !finished) { + //System.out.println(" skip write slot=" + nextWrite); + nextWrite = rollIncr(nextWrite); + } + + if (matchOutput != null) { + //System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput); + inputSkipCount = matchInputLength; + addOutput(matchOutput); + } else if (nextRead != nextWrite) { + // Even though we had no match here, we set to 1 + // because we need to skip current input token before + // trying to match again: + inputSkipCount = 1; + } else { + assert finished; + } + + //System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite); + } + + // Interleaves all output tokens onto the futureOutputs: + private void addOutput(BytesRef bytes) { + bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); + + final int code = bytesReader.readVInt(); + final boolean keepOrig = (code & 0x1) == 0; + final int count = code >>> 1; + //System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig); + for(int outputIDX=0;outputIDX ords = new ArrayList(); + } + + /** Sugar: just joins the provided terms with {@link + * SynonymMap#WORD_SEPARATOR}. reuse and its chars + * must not be null. */ + public static CharsRef join(String[] words, CharsRef reuse) { + int upto = 0; + char[] buffer = reuse.chars; + for(String word : words) { + if (upto > 0) { + if (upto >= buffer.length) { + reuse.grow(upto); + buffer = reuse.chars; + } + buffer[upto++] = SynonymMap.WORD_SEPARATOR; + } + + final int wordLen = word.length(); + final int needed = upto + wordLen; + if (needed > buffer.length) { + reuse.grow(needed); + buffer = reuse.chars; + } + + word.getChars(0, wordLen, buffer, upto); + upto += wordLen; + } + + return reuse; + } + + /** Sugar: analyzes the text with the analyzer and + * separates by {@link SynonymMap#WORD_SEPARATOR}. + * reuse and its chars must not be null. */ + public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException { + TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text)); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); + ts.reset(); + reuse.length = 0; + while (ts.incrementToken()) { + int length = termAtt.length(); + if (length == 0) { + throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); + } + if (posIncAtt.getPositionIncrement() != 1) { + throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1"); + } + reuse.grow(reuse.length + length + 1); /* current + word + separator */ + int end = reuse.offset + reuse.length; + if (reuse.length > 0) { + reuse.chars[end++] = SynonymMap.WORD_SEPARATOR; + reuse.length++; + } + System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length); + reuse.length += length; + } + ts.end(); + ts.close(); + if (reuse.length == 0) { + throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer"); + } + return reuse; + } + + /** only used for asserting! 
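 * A hole is an empty word, ie two adjacent \u0000 separators; analyze()
 * above can never produce one, since it rejects zero-length tokens.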
*/ + private boolean hasHoles(CharsRef chars) { + final int end = chars.offset + chars.length; + for(int idx=chars.offset+1;idx 0 (got " + numInputWords + ")"); + } + if (input.length <= 0) { + throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")"); + } + if (numOutputWords <= 0) { + throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")"); + } + if (output.length <= 0) { + throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")"); + } + + assert !hasHoles(input): "input has holes: " + input; + assert !hasHoles(output): "output has holes: " + output; + + //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords); + final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch); + // lookup in hash + int ord = words.add(utf8Scratch, hashCode); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + //System.out.println(" output=" + output + " old ord=" + ord); + } else { + //System.out.println(" output=" + output + " new ord=" + ord); + } + + MapEntry e = workingSet.get(input); + if (e == null) { + e = new MapEntry(); + workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map + } + + e.ords.add(ord); + e.includeOrig |= includeOrig; + maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords); + maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords); + } + + private int countWords(CharsRef chars) { + int wordCount = 1; + int upto = chars.offset; + final int limit = chars.offset + chars.length; + while(upto < limit) { + if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) { + wordCount++; + } + } + return wordCount; + } + + /** + * Add a phrase->phrase synonym mapping. + * Phrases are character sequences where words are + * separated with character zero (\u0000). Empty words + * (two \u0000s in a row) are not allowed in the input nor + * the output! + * + * @param input input phrase + * @param output output phrase + * @param includeOrig true if the original should be included + */ + public void add(CharsRef input, CharsRef output, boolean includeOrig) { + add(input, countWords(input), output, countWords(output), includeOrig); + } + + /** + * Builds an {@link SynonymMap} and returns it. + */ + public SynonymMap build() throws IOException { + ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + // TODO: are we using the best sharing options? 
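+    // (Worked sketch of the per-entry output encoding assembled below,
+    // values invented: ords {3, 7} with includeOrig=false serialize as
+    // vInt(2 << 1 | 1) = 5, then vInt(3) and vInt(7); the count/flag
+    // vInt is written after the ords and then rotated to the front of
+    // the byte[].)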
+ org.apache.lucene.util.fst.Builder builder = + new org.apache.lucene.util.fst.Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs); + + BytesRef scratch = new BytesRef(64); + ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); + + final Set dedupSet; + + if (dedup) { + dedupSet = new HashSet(); + } else { + dedupSet = null; + } + + final byte[] spare = new byte[5]; + + Set keys = workingSet.keySet(); + CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]); + Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); + + //System.out.println("fmap.build"); + for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) { + CharsRef input = sortedKeys[keyIdx]; + MapEntry output = workingSet.get(input); + + int numEntries = output.ords.size(); + // output size, assume the worst case + int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry + + scratch.grow(estimatedSize); + scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length); + assert scratch.offset == 0; + + // now write our output data: + int count = 0; + for (int i = 0; i < numEntries; i++) { + if (dedupSet != null) { + // box once + final Integer ent = output.ords.get(i); + if (dedupSet.contains(ent)) { + continue; + } + dedupSet.add(ent); + } + scratchOutput.writeVInt(output.ords.get(i)); + count++; + } + + final int pos = scratchOutput.getPosition(); + scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1)); + final int pos2 = scratchOutput.getPosition(); + final int vIntLen = pos2-pos; + + // Move the count + includeOrig to the front of the byte[]: + System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen); + System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos); + System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen); + + if (dedupSet != null) { + dedupSet.clear(); + } + + scratch.length = scratchOutput.getPosition() - scratch.offset; + //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); + builder.add(input, new BytesRef(scratch)); + } + + FST fst = builder.finish(); + return new SynonymMap(fst, words, maxHorizontalContext); } - List superset = currMap.synonyms==null ? replacement : - mergeTokens(Arrays.asList(currMap.synonyms), replacement); - currMap.synonyms = superset.toArray(new Token[superset.size()]); - if (includeOrig) currMap.flags |= INCLUDE_ORIG; } - - - @Override - public String toString() { - StringBuilder sb = new StringBuilder("<"); - if (synonyms!=null) { - sb.append("["); - for (int i=0; i"); - return sb.toString(); - } - - - - /** Produces a List from a List */ - public static List makeTokens(List strings) { - List ret = new ArrayList(strings.size()); - for (String str : strings) { - //Token newTok = new Token(str,0,0,"SYNONYM"); - Token newTok = new Token(str, 0,0,"SYNONYM"); - ret.add(newTok); - } - return ret; - } - - - /** - * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that - * the tokens end up at the same position. 
- * - * Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position) - * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n) - * - */ - public static List mergeTokens(List lst1, List lst2) { - ArrayList result = new ArrayList(); - if (lst1 ==null || lst2 ==null) { - if (lst2 != null) result.addAll(lst2); - if (lst1 != null) result.addAll(lst1); - return result; - } - - int pos=0; - Iterator iter1=lst1.iterator(); - Iterator iter2=lst2.iterator(); - Token tok1 = iter1.hasNext() ? iter1.next() : null; - Token tok2 = iter2.hasNext() ? iter2.next() : null; - int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0; - int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0; - while(tok1!=null || tok2!=null) { - while (tok1 != null && (pos1 <= pos2 || tok2==null)) { - Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type()); - tok.copyBuffer(tok1.buffer(), 0, tok1.length()); - tok.setPositionIncrement(pos1-pos); - result.add(tok); - pos=pos1; - tok1 = iter1.hasNext() ? iter1.next() : null; - pos1 += tok1!=null ? tok1.getPositionIncrement() : 0; - } - while (tok2 != null && (pos2 <= pos1 || tok1==null)) { - Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type()); - tok.copyBuffer(tok2.buffer(), 0, tok2.length()); - tok.setPositionIncrement(pos2-pos); - result.add(tok); - pos=pos2; - tok2 = iter2.hasNext() ? iter2.next() : null; - pos2 += tok2!=null ? tok2.getPositionIncrement() : 0; - } - } - return result; - } - } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java new file mode 100644 index 00000000000..20aeea0e362 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/WordnetSynonymParser.java @@ -0,0 +1,112 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.LineNumberReader; +import java.io.Reader; +import java.text.ParseException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.util.CharsRef; + +/** + * Parser for wordnet prolog format + *
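 * A typical synset line looks like (format sketch; the concrete entry is
 * illustrative):
 * <pre>
 *   s(100001740,1,'entity',n,1,11).
 * </pre>
 * Characters 2-10 carry the synset id and the quoted word is the synonym;
 * consecutive lines sharing a synset id form one synset.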

+ * See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format. + * @lucene.experimental + */ +// TODO: allow you to specify syntactic categories (e.g. just nouns, etc) +public class WordnetSynonymParser extends SynonymMap.Builder { + private final boolean expand; + private final Analyzer analyzer; + + public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { + super(dedup); + this.expand = expand; + this.analyzer = analyzer; + } + + public void add(Reader in) throws IOException, ParseException { + LineNumberReader br = new LineNumberReader(in); + try { + String line = null; + String lastSynSetID = ""; + CharsRef synset[] = new CharsRef[8]; + int synsetSize = 0; + + while ((line = br.readLine()) != null) { + String synSetID = line.substring(2, 11); + + if (!synSetID.equals(lastSynSetID)) { + addInternal(synset, synsetSize); + synsetSize = 0; + } + + if (synset.length <= synsetSize+1) { + CharsRef larger[] = new CharsRef[synset.length * 2]; + System.arraycopy(synset, 0, larger, 0, synsetSize); + synset = larger; + } + + synset[synsetSize] = parseSynonym(line, synset[synsetSize]); + synsetSize++; + lastSynSetID = synSetID; + } + + // final synset in the file + addInternal(synset, synsetSize); + } catch (IllegalArgumentException e) { + ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0); + ex.initCause(e); + throw ex; + } finally { + br.close(); + } + } + + private CharsRef parseSynonym(String line, CharsRef reuse) throws IOException { + if (reuse == null) { + reuse = new CharsRef(8); + } + + int start = line.indexOf('\'')+1; + int end = line.lastIndexOf('\''); + + String text = line.substring(start, end).replace("''", "'"); + return analyze(analyzer, text, reuse); + } + + private void addInternal(CharsRef synset[], int size) throws IOException { + if (size <= 1) { + return; // nothing to do + } + + if (expand) { + for (int i = 0; i < size; i++) { + for (int j = 0; j < size; j++) { + add(synset[i], synset[j], false); + } + } + } else { + for (int i = 0; i < size; i++) { + add(synset[i], synset[0], false); + } + } + } +} diff --git a/lucene/contrib/wordnet/src/java/overview.html b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/package.html similarity index 84% rename from lucene/contrib/wordnet/src/java/overview.html rename to modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/package.html index cd05399880b..2fd37e8de20 100644 --- a/lucene/contrib/wordnet/src/java/overview.html +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/package.html @@ -1,3 +1,4 @@ + - - - - wordnet - - - - wordnet - - \ No newline at end of file + + +Analysis components for Synonyms. + + diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java new file mode 100644 index 00000000000..6260a3d1618 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java @@ -0,0 +1,144 @@ +package org.apache.lucene.analysis.synonym; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.text.ParseException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.junit.Test; + +/** + * Tests parser for the Solr synonyms format + * @lucene.experimental + */ +public class TestSolrSynonymParser extends BaseTokenStreamTestCase { + + /** Tests some simple examples from the solr wiki */ + public void testSimple() throws Exception { + String testFile = + "i-pod, ipod, ipoooood\n" + + "foo => foo bar\n" + + "foo => baz\n" + + "this test, that testing"; + + SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random)); + parser.add(new StringReader(testFile)); + final SynonymMap map = parser.build(); + + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); + } + }; + + assertAnalyzesTo(analyzer, "ball", + new String[] { "ball" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "i-pod", + new String[] { "i-pod", "ipod", "ipoooood" }, + new int[] { 1, 0, 0 }); + + assertAnalyzesTo(analyzer, "foo", + new String[] { "foo", "baz", "bar" }, + new int[] { 1, 0, 1 }); + + assertAnalyzesTo(analyzer, "this test", + new String[] { "this", "that", "test", "testing" }, + new int[] { 1, 0, 1, 0 }); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidDoubleMap() throws Exception { + String testFile = "a => b => c"; + SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random)); + parser.add(new StringReader(testFile)); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidAnalyzesToNothingOutput() throws Exception { + String testFile = "a => 1"; + SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false)); + parser.add(new StringReader(testFile)); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidAnalyzesToNothingInput() throws Exception { + String testFile = "1 => a"; + SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false)); + parser.add(new StringReader(testFile)); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidPositionsInput() throws Exception { + String 
testFile = "testola => the test"; + SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT)); + parser.add(new StringReader(testFile)); + } + + /** parse a syn file with bad syntax */ + @Test(expected=ParseException.class) + public void testInvalidPositionsOutput() throws Exception { + String testFile = "the test => testola"; + SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT)); + parser.add(new StringReader(testFile)); + } + + /** parse a syn file with some escaped syntax chars */ + public void testEscapedStuff() throws Exception { + String testFile = + "a\\=>a => b\\=>b\n" + + "a\\,a => b\\,b"; + SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.KEYWORD, false)); + parser.add(new StringReader(testFile)); + final SynonymMap map = parser.build(); + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false)); + } + }; + + assertAnalyzesTo(analyzer, "ball", + new String[] { "ball" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "a=>a", + new String[] { "b=>b" }, + new int[] { 1 }); + + assertAnalyzesTo(analyzer, "a,a", + new String[] { "b,b" }, + new int[] { 1 }); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java new file mode 100644 index 00000000000..ba1b23f5c6b --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java @@ -0,0 +1,393 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.*; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util._TestUtil; + +public class TestSynonymMapFilter extends BaseTokenStreamTestCase { + + private SynonymMap.Builder b; + private Tokenizer tokensIn; + private SynonymFilter tokensOut; + private CharTermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + private OffsetAttribute offsetAtt; + + private void add(String input, String output, boolean keepOrig) { + b.add(new CharsRef(input.replaceAll(" +", "\u0000")), + new CharsRef(output.replaceAll(" +", "\u0000")), + keepOrig); + } + + private void assertEquals(CharTermAttribute term, String expected) { + assertEquals(expected.length(), term.length()); + final char[] buffer = term.buffer(); + for(int chIDX=0;chIDX 0) { + assertTrue(tokensOut.incrementToken()); + if (VERBOSE) { + System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement()); + } + } + assertEquals(termAtt, expectedAtPos[atPos]); + assertEquals(atPos == 0 ? 1 : 0, + posIncrAtt.getPositionIncrement()); + // start/end offset of all tokens at same pos should + // be the same: + assertEquals(startOffset, offsetAtt.startOffset()); + assertEquals(endOffset, offsetAtt.endOffset()); + } + } + tokensOut.end(); + tokensOut.close(); + if (VERBOSE) { + System.out.println(" incr: END"); + } + assertEquals(expectedUpto, expected.length); + } + + public void testBasic() throws Exception { + b = new SynonymMap.Builder(true); + add("a", "foo", true); + add("a b", "bar fee", true); + add("b c", "dog collar", true); + add("c d", "dog harness holder extras", true); + add("m c e", "dog barks loudly", false); + + add("e f", "foo bar", false); + add("e f", "baz bee", false); + + add("z", "boo", false); + add("y", "bee", true); + + tokensIn = new MockTokenizer(new StringReader("a"), + MockTokenizer.WHITESPACE, + true); + tokensIn.reset(); + assertTrue(tokensIn.incrementToken()); + assertFalse(tokensIn.incrementToken()); + tokensIn.end(); + tokensIn.close(); + + tokensOut = new SynonymFilter(tokensIn, + b.build(), + true); + termAtt = tokensOut.addAttribute(CharTermAttribute.class); + posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class); + offsetAtt = tokensOut.addAttribute(OffsetAttribute.class); + + verify("a b c", "a/bar b/fee c"); + + // syn output extends beyond input tokens + verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras"); + + verify("a b a", "a/bar b/fee a/foo"); + + // outputs that add to one another: + verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras"); + + // two outputs for same input + verify("e f", "foo/baz bar/bee"); + + // mixed keepOrig true/false: + verify("a m c e x", "a/foo dog barks loudly x"); + verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x"); + assertTrue(tokensOut.getCaptureCount() > 0); + + // no captureStates when no syns matched + verify("p q r s t", "p q r s 
t"); + assertEquals(0, tokensOut.getCaptureCount()); + + // no captureStates when only single-input syns, w/ no + // lookahead needed, matched + verify("p q z y t", "p q boo y/bee t"); + assertEquals(0, tokensOut.getCaptureCount()); + } + + private String getRandomString(char start, int alphabetSize, int length) { + assert alphabetSize <= 26; + char[] s = new char[2*length]; + for(int charIDX=0;charIDX out; + boolean keepOrig; + } + + public String slowSynMatcher(String doc, List syns, int maxOutputLength) { + assertTrue(doc.length() % 2 == 0); + final int numInputs = doc.length()/2; + boolean[] keepOrigs = new boolean[numInputs]; + Arrays.fill(keepOrigs, false); + String[] outputs = new String[numInputs + maxOutputLength]; + OneSyn[] matches = new OneSyn[numInputs]; + for(OneSyn syn : syns) { + int idx = -1; + while(true) { + idx = doc.indexOf(syn.in, 1+idx); + if (idx == -1) { + break; + } + assertTrue(idx % 2 == 0); + final int matchIDX = idx/2; + assertTrue(syn.in.length() % 2 == 1); + if (matches[matchIDX] == null) { + matches[matchIDX] = syn; + } else if (syn.in.length() > matches[matchIDX].in.length()) { + // Greedy conflict resolution: longer match wins: + matches[matchIDX] = syn; + } else { + assertTrue(syn.in.length() < matches[matchIDX].in.length()); + } + } + } + + // Greedy conflict resolution: if syn matches a range of inputs, + // it prevents other syns from matching that range + for(int inputIDX=0;inputIDX= numInputs && outputs[inputIDX] == null) { + break; + } + if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) { + sb.append(inputTokens[inputIDX]); + posHasOutput = true; + } + + if (outputs[inputIDX] != null) { + if (posHasOutput) { + sb.append('/'); + } + sb.append(outputs[inputIDX]); + } + if (inputIDX < limit-1) { + sb.append(' '); + } + } + + return sb.toString(); + } + + public void testRandom() throws Exception { + + final int alphabetSize = _TestUtil.nextInt(random, 2, 7); + + final int docLen = atLeast(3000); + //final int docLen = 50; + + final String document = getRandomString('a', alphabetSize, docLen); + + if (VERBOSE) { + System.out.println("TEST: doc=" + document); + } + + final int numSyn = atLeast(5); + //final int numSyn = 2; + + final Map synMap = new HashMap(); + final List syns = new ArrayList(); + final boolean dedup = random.nextBoolean(); + if (VERBOSE) { + System.out.println(" dedup=" + dedup); + } + b = new SynonymMap.Builder(dedup); + for(int synIDX=0;synIDX(); + synMap.put(synIn, s); + s.keepOrig = random.nextBoolean(); + } + final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim(); + s.out.add(synOut); + add(synIn, synOut, s.keepOrig); + if (VERBOSE) { + System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig); + } + } + + tokensIn = new MockTokenizer(new StringReader("a"), + MockTokenizer.WHITESPACE, + true); + tokensIn.reset(); + assertTrue(tokensIn.incrementToken()); + assertFalse(tokensIn.incrementToken()); + tokensIn.end(); + tokensIn.close(); + + tokensOut = new SynonymFilter(tokensIn, + b.build(), + true); + termAtt = tokensOut.addAttribute(CharTermAttribute.class); + posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class); + offsetAtt = tokensOut.addAttribute(OffsetAttribute.class); + + if (dedup) { + pruneDups(syns); + } + + final String expected = slowSynMatcher(document, syns, 5); + + if (VERBOSE) { + System.out.println("TEST: expected=" + expected); + } + + verify(document, expected); + } + + private void pruneDups(List 
syns) { + Set seen = new HashSet(); + for(OneSyn syn : syns) { + int idx = 0; + while(idx < syn.out.size()) { + String out = syn.out.get(idx); + if (!seen.contains(out)) { + seen.add(out); + idx++; + } else { + syn.out.remove(idx); + } + } + seen.clear(); + } + } + + private String randomNonEmptyString() { + while(true) { + final String s = _TestUtil.randomUnicodeString(random).trim(); + if (s.length() != 0 && s.indexOf('\u0000') == -1) { + return s; + } + } + } + + /** simple random test, doesn't verify correctness. + * does verify it doesnt throw exceptions, or that the stream doesn't misbehave + */ + public void testRandom2() throws Exception { + final int numIters = atLeast(10); + for (int i = 0; i < numIters; i++) { + b = new SynonymMap.Builder(random.nextBoolean()); + final int numEntries = atLeast(10); + for (int j = 0; j < numEntries; j++) { + add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean()); + } + final SynonymMap map = b.build(); + final boolean ignoreCase = random.nextBoolean(); + + final Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase)); + } + }; + + checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER); + } + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java new file mode 100644 index 00000000000..6f1c6329afb --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestWordnetSynonymParser.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
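The test data below is hand-written WordNet prolog. Per the prologdb documentation referenced by the parser, each fact has the shape s(synset_id, w_num, 'word', ss_type, sense_number, tag_count), and consecutive facts sharing a synset_id form one group of synonyms. A rough sketch of the grouping loop the parser runs, assuming java.util imports (variable names are mine):

    String[] facts = {
        "s(100000001,1,'woods',n,1,0).",
        "s(100000001,2,'wood',n,1,0).",
        "s(100000001,3,'forest',n,1,0).",
        "s(100000002,1,'wolfish',n,1,0).",
    };
    String lastId = "";
    List<String> group = new ArrayList<String>();
    for (String fact : facts) {
      String id = fact.substring(2, 11);
      if (!id.equals(lastId) && !group.isEmpty()) {
        System.out.println(lastId + " -> " + group);   // flush one completed synset
        group.clear();
      }
      int start = fact.indexOf('\'') + 1;
      group.add(fact.substring(start, fact.lastIndexOf('\'')).replace("''", "'"));
      lastId = id;
    }
    System.out.println(lastId + " -> " + group);       // last synset in the file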
+ */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +public class TestWordnetSynonymParser extends BaseTokenStreamTestCase { + Analyzer analyzer; + + String synonymsFile = + "s(100000001,1,'woods',n,1,0).\n" + + "s(100000001,2,'wood',n,1,0).\n" + + "s(100000001,3,'forest',n,1,0).\n" + + "s(100000002,1,'wolfish',n,1,0).\n" + + "s(100000002,2,'ravenous',n,1,0).\n" + + "s(100000003,1,'king',n,1,1).\n" + + "s(100000003,2,'baron',n,1,1).\n" + + "s(100000004,1,'king''s evil',n,1,1).\n" + + "s(100000004,2,'king''s meany',n,1,1).\n"; + + public void testSynonyms() throws Exception { + WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random)); + parser.add(new StringReader(synonymsFile)); + final SynonymMap map = parser.build(); + + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false)); + } + }; + + /* all expansions */ + assertAnalyzesTo(analyzer, "Lost in the woods", + new String[] { "Lost", "in", "the", "woods", "wood", "forest" }, + new int[] { 0, 5, 8, 12, 12, 12 }, + new int[] { 4, 7, 11, 17, 17, 17 }, + new int[] { 1, 1, 1, 1, 0, 0 }); + + /* single quote */ + assertAnalyzesTo(analyzer, "king", + new String[] { "king", "baron" }); + + /* multi words */ + assertAnalyzesTo(analyzer, "king's evil", + new String[] { "king's", "king's", "evil", "meany" }); + } +} diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java index 48b5d251d85..317090863eb 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java @@ -90,6 +90,10 @@ import org.apache.lucene.store.OutputStreamDataOutput; * *
"alphabetically" in any of the documentation above indicates utf16 codepoint order, * nothing else. + * + * NOTE: the FST file format is experimental and + * subject to suddenly change, requiring you to rebuild the + * FST suggest index. */ public class FSTLookup extends Lookup { diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index e0f2c21ea5a..653bcfa58e6 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -320,6 +320,9 @@ New Features Optimizations ---------------------- +* LUCENE-3233: Improved memory usage, build time, and performance of + SynonymFilterFactory. (Mike McCandless, Robert Muir) + Bug Fixes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java new file mode 100644 index 00000000000..151f5a9b623 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/analysis/FSTSynonymFilterFactory.java @@ -0,0 +1,157 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.text.ParseException; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.synonym.SynonymFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.synonym.SolrSynonymParser; +import org.apache.lucene.analysis.synonym.WordnetSynonymParser; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.util.Version; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.util.plugin.ResourceLoaderAware; + +/** + * @deprecated (3.4) use {@link SynonymFilterFactory} instead. this is only a backwards compatibility + * mechanism that will be removed in Lucene 5.0 + */ +// NOTE: rename this to "SynonymFilterFactory" and nuke that delegator in Lucene 5.0! 
+@Deprecated +final class FSTSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + private SynonymMap map; + private boolean ignoreCase; + + @Override + public TokenStream create(TokenStream input) { + return new SynonymFilter(input, map, ignoreCase); + } + + @Override + public void inform(ResourceLoader loader) { + final boolean ignoreCase = getBoolean("ignoreCase", false); + this.ignoreCase = ignoreCase; + + String tf = args.get("tokenizerFactory"); + + final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args); + + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader); + TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer; + return new TokenStreamComponents(tokenizer, stream); + } + }; + + String format = args.get("format"); + try { + if (format == null || format.equals("solr")) { + // TODO: expose dedup as a parameter? + map = loadSolrSynonyms(loader, true, analyzer); + } else if (format.equals("wordnet")) { + map = loadWordnetSynonyms(loader, true, analyzer); + } else { + // TODO: somehow make this more pluggable + throw new RuntimeException("Unrecognized synonyms format: " + format); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Load synonyms from the solr format, "format=solr". + */ + private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException { + final boolean expand = getBoolean("expand", true); + String synonyms = args.get("synonyms"); + if (synonyms == null) + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'."); + + CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer); + File synonymFile = new File(synonyms); + if (synonymFile.exists()) { + decoder.reset(); + parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); + } else { + List files = StrUtils.splitFileNames(synonyms); + for (String file : files) { + decoder.reset(); + parser.add(new InputStreamReader(loader.openResource(file), decoder)); + } + } + return parser.build(); + } + + /** + * Load synonyms from the wordnet format, "format=wordnet". 
+ */ + private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException { + final boolean expand = getBoolean("expand", true); + String synonyms = args.get("synonyms"); + if (synonyms == null) + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'."); + + CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer); + File synonymFile = new File(synonyms); + if (synonymFile.exists()) { + decoder.reset(); + parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); + } else { + List files = StrUtils.splitFileNames(synonyms); + for (String file : files) { + decoder.reset(); + parser.add(new InputStreamReader(loader.openResource(file), decoder)); + } + } + return parser.build(); + } + + private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){ + TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname); + tokFactory.init(args); + return tokFactory; + } +} diff --git a/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilter.java b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilter.java new file mode 100644 index 00000000000..d97cacda7b6 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilter.java @@ -0,0 +1,261 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.AttributeSource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedList; + +/** SynonymFilter handles multi-token synonyms with variable position increment offsets. + *
+ * The matched tokens from the input stream may be optionally passed through (includeOrig=true) + * or discarded. If the original tokens are included, the position increments may be modified + * to retain absolute positions after merging with the synonym tokenstream. + *
+ * Generated synonyms will start at the same position as the first matched source token. + * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0 + */ +@Deprecated +final class SlowSynonymFilter extends TokenFilter { + + private final SlowSynonymMap map; // Map + private Iterator replacement; // iterator over generated tokens + + public SlowSynonymFilter(TokenStream in, SlowSynonymMap map) { + super(in); + if (map == null) + throw new IllegalArgumentException("map is required"); + + this.map = map; + // just ensuring these attributes exist... + addAttribute(CharTermAttribute.class); + addAttribute(PositionIncrementAttribute.class); + addAttribute(OffsetAttribute.class); + addAttribute(TypeAttribute.class); + } + + + /* + * Need to worry about multiple scenarios: + * - need to go for the longest match + * a b => foo #shouldn't match if "a b" is followed by "c d" + * a b c d => bar + * - need to backtrack - retry matches for tokens already read + * a b c d => foo + * b c => bar + * If the input stream is "a b c x", one will consume "a b c d" + * trying to match the first rule... all but "a" should be + * pushed back so a match may be made on "b c". + * - don't try and match generated tokens (thus need separate queue) + * matching is not recursive. + * - handle optional generation of original tokens in all these cases, + * merging token streams to preserve token positions. + * - preserve original positionIncrement of first matched token + */ + @Override + public boolean incrementToken() throws IOException { + while (true) { + // if there are any generated tokens, return them... don't try any + // matches against them, as we specifically don't want recursion. + if (replacement!=null && replacement.hasNext()) { + copy(this, replacement.next()); + return true; + } + + // common case fast-path of first token not matching anything + AttributeSource firstTok = nextTok(); + if (firstTok == null) return false; + CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class); + SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null; + if (result == null) { + copy(this, firstTok); + return true; + } + + // fast-path failed, clone ourselves if needed + if (firstTok == this) + firstTok = cloneAttributes(); + // OK, we matched a token, so find the longest match. + + matched = new LinkedList(); + + result = match(result); + + if (result==null) { + // no match, simply return the first token read. + copy(this, firstTok); + return true; + } + + // reuse, or create new one each time? + ArrayList generated = new ArrayList(result.synonyms.length + matched.size() + 1); + + // + // there was a match... let's generate the new tokens, merging + // in the matched tokens (position increments need adjusting) + // + AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast(); + boolean includeOrig = result.includeOrig(); + + AttributeSource origTok = includeOrig ? firstTok : null; + PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class); + int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream + int repPos=0; // curr position in replacement token stream + int pos=0; // current position in merged token stream + + for (int i=0; i foo/0 + // should I re-create the gap on the next buffered token? 
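+ // Recap of the bookkeeping above: 'pos' is the last position emitted into the merged stream,
+ // while 'origPos' and 'repPos' walk the original and replacement streams; each emitted token
+ // gets increment (absolute position - pos), which is how the two streams interleave without
+ // shifting the positions of later tokens.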
+ + replacement = generated.iterator(); + // Now return to the top of the loop to read and return the first + // generated token.. The reason this is done is that we may have generated + // nothing at all, and may need to continue with more matching logic. + } + } + + + // + // Defer creation of the buffer until the first time it is used to + // optimize short fields with no matches. + // + private LinkedList buffer; + private LinkedList matched; + + private boolean exhausted; + + private AttributeSource nextTok() throws IOException { + if (buffer!=null && !buffer.isEmpty()) { + return buffer.removeFirst(); + } else { + if (!exhausted && input.incrementToken()) { + return this; + } else { + exhausted = true; + return null; + } + } + } + + private void pushTok(AttributeSource t) { + if (buffer==null) buffer=new LinkedList(); + buffer.addFirst(t); + } + + private SlowSynonymMap match(SlowSynonymMap map) throws IOException { + SlowSynonymMap result = null; + + if (map.submap != null) { + AttributeSource tok = nextTok(); + if (tok != null) { + // clone ourselves. + if (tok == this) + tok = cloneAttributes(); + // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level? + CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class); + SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length()); + + if (subMap != null) { + // recurse + result = match(subMap); + } + + if (result != null) { + matched.addFirst(tok); + } else { + // push back unmatched token + pushTok(tok); + } + } + } + + // if no longer sequence matched, so if this node has synonyms, it's the match. + if (result==null && map.synonyms!=null) { + result = map; + } + + return result; + } + + private void copy(AttributeSource target, AttributeSource source) { + if (target != source) + source.copyTo(target); + } + + @Override + public void reset() throws IOException { + input.reset(); + replacement = null; + exhausted = false; + } +} diff --git a/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilterFactory.java new file mode 100644 index 00000000000..3390d0d53c0 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymFilterFactory.java @@ -0,0 +1,188 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.util.plugin.ResourceLoaderAware; + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * Factory for {@link SlowSynonymFilter} (only used with luceneMatchVersion < 3.4) + *
+ * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
+ *             expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0 + */ +@Deprecated +final class SlowSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + + public void inform(ResourceLoader loader) { + String synonyms = args.get("synonyms"); + if (synonyms == null) + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'."); + boolean ignoreCase = getBoolean("ignoreCase", false); + boolean expand = getBoolean("expand", true); + + String tf = args.get("tokenizerFactory"); + TokenizerFactory tokFactory = null; + if( tf != null ){ + tokFactory = loadTokenizerFactory( loader, tf, args ); + } + + Iterable wlist=loadRules( synonyms, loader ); + + synMap = new SlowSynonymMap(ignoreCase); + parseRules(wlist, synMap, "=>", ",", expand,tokFactory); + } + + /** + * @return a list of all rules + */ + protected Iterable loadRules( String synonyms, ResourceLoader loader ) { + List wlist=null; + try { + File synonymFile = new File(synonyms); + if (synonymFile.exists()) { + wlist = loader.getLines(synonyms); + } else { + List files = StrUtils.splitFileNames(synonyms); + wlist = new ArrayList(); + for (String file : files) { + List lines = loader.getLines(file.trim()); + wlist.addAll(lines); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + return wlist; + } + + private SlowSynonymMap synMap; + + static void parseRules(Iterable rules, SlowSynonymMap map, String mappingSep, + String synSep, boolean expansion, TokenizerFactory tokFactory) { + int count=0; + for (String rule : rules) { + // To use regexes, we need an expression that specifies an odd number of chars. + // This can't really be done with string.split(), and since we need to + // do unescaping at some point anyway, we wouldn't be saving any effort + // by using regexes. + + List mapping = StrUtils.splitSmart(rule, mappingSep, false); + + List> source; + List> target; + + if (mapping.size() > 2) { + throw new RuntimeException("Invalid Synonym Rule:" + rule); + } else if (mapping.size()==2) { + source = getSynList(mapping.get(0), synSep, tokFactory); + target = getSynList(mapping.get(1), synSep, tokFactory); + } else { + source = getSynList(mapping.get(0), synSep, tokFactory); + if (expansion) { + // expand to all arguments + target = source; + } else { + // reduce to first argument + target = new ArrayList>(1); + target.add(source.get(0)); + } + } + + boolean includeOrig=false; + for (List fromToks : source) { + count++; + for (List toToks : target) { + map.add(fromToks, + SlowSynonymMap.makeTokens(toToks), + includeOrig, + true + ); + } + } + } + } + + // a , b c , d e f => [[a],[b,c],[d,e,f]] + private static List> getSynList(String str, String separator, TokenizerFactory tokFactory) { + List strList = StrUtils.splitSmart(str, separator, false); + // now split on whitespace to get a list of token strings + List> synList = new ArrayList>(); + for (String toks : strList) { + List tokList = tokFactory == null ? 
+ StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory); + synList.add(tokList); + } + return synList; + } + + private static List splitByTokenizer(String source, TokenizerFactory tokFactory){ + StringReader reader = new StringReader( source ); + TokenStream ts = loadTokenizer(tokFactory, reader); + List tokList = new ArrayList(); + try { + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + while (ts.incrementToken()){ + if( termAtt.length() > 0 ) + tokList.add( termAtt.toString() ); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + finally{ + reader.close(); + } + return tokList; + } + + private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){ + TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname ); + tokFactory.init( args ); + return tokFactory; + } + + private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){ + return tokFactory.create( reader ); + } + + public SlowSynonymMap getSynonymMap() { + return synMap; + } + + public SlowSynonymFilter create(TokenStream input) { + return new SlowSynonymFilter(input,synMap); + } +} diff --git a/solr/core/src/java/org/apache/solr/analysis/SlowSynonymMap.java b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymMap.java new file mode 100644 index 00000000000..21570ae4438 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/analysis/SlowSynonymMap.java @@ -0,0 +1,162 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.util.CharArrayMap; +import org.apache.lucene.util.Version; + +import java.util.*; + +/** Mapping rules for use with {@link SlowSynonymFilter} + * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. 
this factory will be removed in Lucene 5.0 + */ +@Deprecated +class SlowSynonymMap { + /** @lucene.internal */ + public CharArrayMap submap; // recursive: Map + /** @lucene.internal */ + public Token[] synonyms; + int flags; + + static final int INCLUDE_ORIG=0x01; + static final int IGNORE_CASE=0x02; + + public SlowSynonymMap() {} + public SlowSynonymMap(boolean ignoreCase) { + if (ignoreCase) flags |= IGNORE_CASE; + } + + public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; } + public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; } + + /** + * @param singleMatch List, the sequence of strings to match + * @param replacement List the list of tokens to use on a match + * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens + * @param mergeExisting merge the replacement tokens with any other mappings that exist + */ + public void add(List singleMatch, List replacement, boolean includeOrig, boolean mergeExisting) { + SlowSynonymMap currMap = this; + for (String str : singleMatch) { + if (currMap.submap==null) { + // for now hardcode at 4.0, as its what the old code did. + // would be nice to fix, but shouldn't store a version in each submap!!! + currMap.submap = new CharArrayMap(Version.LUCENE_40, 1, ignoreCase()); + } + + SlowSynonymMap map = currMap.submap.get(str); + if (map==null) { + map = new SlowSynonymMap(); + map.flags |= flags & IGNORE_CASE; + currMap.submap.put(str, map); + } + + currMap = map; + } + + if (currMap.synonyms != null && !mergeExisting) { + throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch); + } + List superset = currMap.synonyms==null ? replacement : + mergeTokens(Arrays.asList(currMap.synonyms), replacement); + currMap.synonyms = superset.toArray(new Token[superset.size()]); + if (includeOrig) currMap.flags |= INCLUDE_ORIG; + } + + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("<"); + if (synonyms!=null) { + sb.append("["); + for (int i=0; i"); + return sb.toString(); + } + + + + /** Produces a List from a List */ + public static List makeTokens(List strings) { + List ret = new ArrayList(strings.size()); + for (String str : strings) { + //Token newTok = new Token(str,0,0,"SYNONYM"); + Token newTok = new Token(str, 0,0,"SYNONYM"); + ret.add(newTok); + } + return ret; + } + + + /** + * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that + * the tokens end up at the same position. + * + * Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position) + * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n) + * + */ + public static List mergeTokens(List lst1, List lst2) { + ArrayList result = new ArrayList(); + if (lst1 ==null || lst2 ==null) { + if (lst2 != null) result.addAll(lst2); + if (lst1 != null) result.addAll(lst1); + return result; + } + + int pos=0; + Iterator iter1=lst1.iterator(); + Iterator iter2=lst2.iterator(); + Token tok1 = iter1.hasNext() ? iter1.next() : null; + Token tok2 = iter2.hasNext() ? iter2.next() : null; + int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0; + int pos2 = tok2!=null ? 
tok2.getPositionIncrement() : 0; + while(tok1!=null || tok2!=null) { + while (tok1 != null && (pos1 <= pos2 || tok2==null)) { + Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type()); + tok.copyBuffer(tok1.buffer(), 0, tok1.length()); + tok.setPositionIncrement(pos1-pos); + result.add(tok); + pos=pos1; + tok1 = iter1.hasNext() ? iter1.next() : null; + pos1 += tok1!=null ? tok1.getPositionIncrement() : 0; + } + while (tok2 != null && (pos2 <= pos1 || tok1==null)) { + Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type()); + tok.copyBuffer(tok2.buffer(), 0, tok2.length()); + tok.setPositionIncrement(pos2-pos); + result.add(tok); + pos=pos2; + tok2 = iter2.hasNext() ? iter2.next() : null; + pos2 += tok2!=null ? tok2.getPositionIncrement() : 0; + } + } + return result; + } + +} diff --git a/solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java index 3b8e4802b7d..d95fd1855b2 100644 --- a/solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java @@ -1,189 +1,54 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - package org.apache.solr.analysis; +import java.util.Map; + import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.synonym.SynonymFilter; -import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.Version; import org.apache.solr.common.ResourceLoader; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.util.StrUtils; import org.apache.solr.util.plugin.ResourceLoaderAware; -import java.io.File; -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - /** * Factory for {@link SynonymFilter}. *
  * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
  *   <analyzer>
  *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
- *             expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
+ *             format="solr" ignoreCase="false" expand="true" 
+ *             tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
  *   </analyzer>
  * </fieldType>
- * */ public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + private BaseTokenFilterFactory delegator; + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + if (luceneMatchVersion.onOrAfter(Version.LUCENE_34)) { + delegator = new FSTSynonymFilterFactory(); + } else { + // check if you use the new optional arg "format". this makes no sense for the old one, + // as its wired to solr's synonyms format only. + if (args.containsKey("format") && !args.get("format").equals("solr")) { + throw new IllegalArgumentException("You must specify luceneMatchVersion >= 3.4 to use alternate synonyms formats"); + } + delegator = new SlowSynonymFilterFactory(); + } + delegator.init(args); + } + + @Override + public TokenStream create(TokenStream input) { + assert delegator != null : "init() was not called!"; + return delegator.create(input); + } + + @Override public void inform(ResourceLoader loader) { - String synonyms = args.get("synonyms"); - if (synonyms == null) - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'."); - boolean ignoreCase = getBoolean("ignoreCase", false); - boolean expand = getBoolean("expand", true); - - String tf = args.get("tokenizerFactory"); - TokenizerFactory tokFactory = null; - if( tf != null ){ - tokFactory = loadTokenizerFactory( loader, tf, args ); - } - - Iterable wlist=loadRules( synonyms, loader ); - - synMap = new SynonymMap(ignoreCase); - parseRules(wlist, synMap, "=>", ",", expand,tokFactory); - } - - /** - * @return a list of all rules - */ - protected Iterable loadRules( String synonyms, ResourceLoader loader ) { - List wlist=null; - try { - File synonymFile = new File(synonyms); - if (synonymFile.exists()) { - wlist = loader.getLines(synonyms); - } else { - List files = StrUtils.splitFileNames(synonyms); - wlist = new ArrayList(); - for (String file : files) { - List lines = loader.getLines(file.trim()); - wlist.addAll(lines); - } - } - } catch (IOException e) { - throw new RuntimeException(e); - } - return wlist; - } - - private SynonymMap synMap; - - static void parseRules(Iterable rules, SynonymMap map, String mappingSep, - String synSep, boolean expansion, TokenizerFactory tokFactory) { - int count=0; - for (String rule : rules) { - // To use regexes, we need an expression that specifies an odd number of chars. - // This can't really be done with string.split(), and since we need to - // do unescaping at some point anyway, we wouldn't be saving any effort - // by using regexes. 
- - List mapping = StrUtils.splitSmart(rule, mappingSep, false); - - List> source; - List> target; - - if (mapping.size() > 2) { - throw new RuntimeException("Invalid Synonym Rule:" + rule); - } else if (mapping.size()==2) { - source = getSynList(mapping.get(0), synSep, tokFactory); - target = getSynList(mapping.get(1), synSep, tokFactory); - } else { - source = getSynList(mapping.get(0), synSep, tokFactory); - if (expansion) { - // expand to all arguments - target = source; - } else { - // reduce to first argument - target = new ArrayList>(1); - target.add(source.get(0)); - } - } - - boolean includeOrig=false; - for (List fromToks : source) { - count++; - for (List toToks : target) { - map.add(fromToks, - SynonymMap.makeTokens(toToks), - includeOrig, - true - ); - } - } - } - } - - // a , b c , d e f => [[a],[b,c],[d,e,f]] - private static List> getSynList(String str, String separator, TokenizerFactory tokFactory) { - List strList = StrUtils.splitSmart(str, separator, false); - // now split on whitespace to get a list of token strings - List> synList = new ArrayList>(); - for (String toks : strList) { - List tokList = tokFactory == null ? - StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory); - synList.add(tokList); - } - return synList; - } - - private static List splitByTokenizer(String source, TokenizerFactory tokFactory){ - StringReader reader = new StringReader( source ); - TokenStream ts = loadTokenizer(tokFactory, reader); - List tokList = new ArrayList(); - try { - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - while (ts.incrementToken()){ - if( termAtt.length() > 0 ) - tokList.add( termAtt.toString() ); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - finally{ - reader.close(); - } - return tokList; - } - - private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map args){ - TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname ); - tokFactory.init( args ); - return tokFactory; - } - - private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){ - return tokFactory.create( reader ); - } - - public SynonymMap getSynonymMap() { - return synMap; - } - - public SynonymFilter create(TokenStream input) { - return new SynonymFilter(input,synMap); + assert delegator != null : "init() was not called!"; + ((ResourceLoaderAware) delegator).inform(loader); } } diff --git a/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java b/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java index f0dd0782567..6afda9bed98 100644 --- a/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java +++ b/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java @@ -17,30 +17,69 @@ package org.apache.solr.analysis; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.analysis.synonym.SynonymFilter; -import org.apache.lucene.analysis.synonym.SynonymMap; -import org.junit.Test; +import org.apache.solr.common.ResourceLoader; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.StringReader; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** * @since solr 1.4 */ public class TestMultiWordSynonyms extends BaseTokenTestCase { - @Test - public void testMultiWordSynonyms() throws 
IOException { + /** + * @deprecated Remove this test in 5.0 + */ + @Deprecated + public void testMultiWordSynonymsOld() throws IOException { List rules = new ArrayList(); rules.add("a b c,d"); - SynonymMap synMap = new SynonymMap(true); - SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null); + SlowSynonymMap synMap = new SlowSynonymMap(true); + SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null); - SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap); + SlowSynonymFilter ts = new SlowSynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap); // This fails because ["e","e"] is the value of the token stream assertTokenStreamContents(ts, new String[] { "a", "e" }); } + + public void testMultiWordSynonyms() throws IOException { + SynonymFilterFactory factory = new SynonymFilterFactory(); + Map args = new HashMap(); + args.putAll(DEFAULT_VERSION_PARAM); + args.put("synonyms", "synonyms.txt"); + factory.init(args); + factory.inform(new StringMockSolrResourceLoader("a b c,d")); + TokenStream ts = factory.create(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false)); + // This fails because ["e","e"] is the value of the token stream + assertTokenStreamContents(ts, new String[] { "a", "e" }); + } + + private class StringMockSolrResourceLoader implements ResourceLoader { + String text; + + StringMockSolrResourceLoader(String text) { + this.text = text; + } + + public List getLines(String resource) throws IOException { + return null; + } + + public Object newInstance(String cname, String... subpackages) { + return null; + } + + public InputStream openResource(String resource) throws IOException { + return new ByteArrayInputStream(text.getBytes("UTF-8")); + } + } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java b/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java similarity index 92% rename from modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java rename to solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java index 82c2e1ce6ae..740ad33b17f 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java +++ b/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.analysis.synonym; +package org.apache.solr.analysis; import java.io.IOException; import java.io.StringReader; @@ -29,51 +29,52 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.tokenattributes.*; /** + * @deprecated Remove this test in Lucene 5.0 */ -public class TestSynonymFilter extends BaseTokenStreamTestCase { +@Deprecated +public class TestSlowSynonymFilter extends BaseTokenStreamTestCase { static List strings(String str) { String[] arr = str.split(" "); return Arrays.asList(arr); } - static void assertTokenizesTo(SynonymMap dict, String input, + static void assertTokenizesTo(SlowSynonymMap dict, String input, String expected[]) throws IOException { Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false); - SynonymFilter stream = new SynonymFilter(tokenizer, dict); + SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict); assertTokenStreamContents(stream, expected); } - static void assertTokenizesTo(SynonymMap dict, String input, + static void assertTokenizesTo(SlowSynonymMap dict, String input, String expected[], int posIncs[]) throws IOException { Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false); - SynonymFilter stream = new SynonymFilter(tokenizer, dict); + SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict); assertTokenStreamContents(stream, expected, posIncs); } - static void assertTokenizesTo(SynonymMap dict, List input, + static void assertTokenizesTo(SlowSynonymMap dict, List input, String expected[], int posIncs[]) throws IOException { TokenStream tokenizer = new IterTokenStream(input); - SynonymFilter stream = new SynonymFilter(tokenizer, dict); + SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict); assertTokenStreamContents(stream, expected, posIncs); } - static void assertTokenizesTo(SynonymMap dict, List input, + static void assertTokenizesTo(SlowSynonymMap dict, List input, String expected[], int startOffsets[], int endOffsets[], int posIncs[]) throws IOException { TokenStream tokenizer = new IterTokenStream(input); - SynonymFilter stream = new SynonymFilter(tokenizer, dict); + SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict); assertTokenStreamContents(stream, expected, startOffsets, endOffsets, posIncs); } public void testMatching() throws IOException { - SynonymMap map = new SynonymMap(); + SlowSynonymMap map = new SlowSynonymMap(); boolean orig = false; boolean merge = true; @@ -110,7 +111,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase { } public void testIncludeOrig() throws IOException { - SynonymMap map = new SynonymMap(); + SlowSynonymMap map = new SlowSynonymMap(); boolean orig = true; boolean merge = true; @@ -167,7 +168,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase { public void testMapMerge() throws IOException { - SynonymMap map = new SynonymMap(); + SlowSynonymMap map = new SlowSynonymMap(); boolean orig = false; boolean merge = true; @@ -206,7 +207,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase { public void testOverlap() throws IOException { - SynonymMap map = new SynonymMap(); + SlowSynonymMap map = new SlowSynonymMap(); boolean orig = false; boolean merge = true; @@ -229,7 +230,7 @@ public 
class TestSynonymFilter extends BaseTokenStreamTestCase { } public void testPositionIncrements() throws IOException { - SynonymMap map = new SynonymMap(); + SlowSynonymMap map = new SlowSynonymMap(); boolean orig = false; boolean merge = true; @@ -264,7 +265,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase { public void testPositionIncrementsWithOrig() throws IOException { - SynonymMap map = new SynonymMap(); + SlowSynonymMap map = new SlowSynonymMap(); boolean orig = true; boolean merge = true; @@ -304,7 +305,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase { // x=>y // analysing "a x" causes "y" to have a bad offset (end less than start) // SOLR-167 - SynonymMap map = new SynonymMap(); + SlowSynonymMap map = new SlowSynonymMap(); boolean orig = false; boolean merge = true; diff --git a/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java b/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java new file mode 100644 index 00000000000..24b4ef505a9 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java @@ -0,0 +1,62 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java b/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java
new file mode 100644
index 00000000000..24b4ef505a9
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java
@@ -0,0 +1,62 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.util.Version;
+import org.apache.solr.core.SolrResourceLoader;
+
+public class TestSynonymFilterFactory extends BaseTokenTestCase {
+  /** test that we can parse and use the solr syn file */
+  public void testSynonyms() throws Exception {
+    SynonymFilterFactory factory = new SynonymFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.putAll(DEFAULT_VERSION_PARAM);
+    args.put("synonyms", "synonyms.txt");
+    factory.init(args);
+    factory.inform(new SolrResourceLoader(null, null));
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
+    assertTrue(ts instanceof SynonymFilter);
+    assertTokenStreamContents(ts,
+        new String[] { "GB", "gib", "gigabyte", "gigabytes" },
+        new int[] { 1, 0, 0, 0 });
+  }
+
+  /** test that we can parse and use the solr syn file, with the old impl
+   * @deprecated Remove this test in Lucene 5.0 */
+  @Deprecated
+  public void testSynonymsOld() throws Exception {
+    SynonymFilterFactory factory = new SynonymFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", Version.LUCENE_33.toString());
+    args.put("synonyms", "synonyms.txt");
+    factory.init(args);
+    factory.inform(new SolrResourceLoader(null, null));
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
+    assertTrue(ts instanceof SlowSynonymFilter);
+    assertTokenStreamContents(ts,
+        new String[] { "GB", "gib", "gigabyte", "gigabytes" },
+        new int[] { 1, 0, 0, 0 });
+  }
+}
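The new test above goes through the factory, but nothing stops code from wiring the FST-backed filter directly. A rough sketch, assuming the new module's SynonymMap.Builder API (the Builder, its add(CharsRef, CharsRef, boolean) signature, and the '\u0000' word separator are assumptions about the new org.apache.lucene.analysis.synonym package, not something this hunk shows):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.synonym.SynonymFilter;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.util.CharsRef;

    public class NewSynonymSketch {
      // Registers two synonym entries and stacks matches over the input
      // stream; multi-word entries separate words with '\u0000'.
      public static TokenStream wrap(TokenStream in) throws IOException {
        SynonymMap.Builder builder = new SynonymMap.Builder(true); // dedup
        builder.add(new CharsRef("gb"), new CharsRef("gigabyte"), true);
        builder.add(new CharsRef("domain\u0000name\u0000system"),
                    new CharsRef("dns"), true);          // keepOrig = true
        return new SynonymFilter(in, builder.build(), true); // ignoreCase
      }
    }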
diff --git a/solr/core/src/test/org/apache/solr/analysis/TestSynonymMap.java b/solr/core/src/test/org/apache/solr/analysis/TestSynonymMap.java
index d3a6ee77873..66b3a5c7743 100644
--- a/solr/core/src/test/org/apache/solr/analysis/TestSynonymMap.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestSynonymMap.java
@@ -25,32 +25,35 @@
 import java.util.List;
 import java.util.Map;
 
 import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.solr.common.ResourceLoader;
 
+/**
+ * @deprecated Remove this test in Lucene 5.0
+ */
+@Deprecated
 public class TestSynonymMap extends LuceneTestCase {
 
   public void testInvalidMappingRules() throws Exception {
-    SynonymMap synMap = new SynonymMap( true );
+    SlowSynonymMap synMap = new SlowSynonymMap( true );
     List<String> rules = new ArrayList<String>( 1 );
     rules.add( "a=>b=>c" );
     try{
-      SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+      SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
       fail( "RuntimeException must be thrown." );
     }
     catch( RuntimeException expected ){}
   }
 
   public void testReadMappingRules() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;
 
     // (a)->[b]
     List<String> rules = new ArrayList<String>();
     rules.add( "a=>b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "b" );
@@ -58,8 +61,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (b)->[c]
     rules.clear();
     rules.add( "a,b=>c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "c" );
     assertTokIncludes( synMap, "b", "c" );
@@ -67,8 +70,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (a)->[b][c]
     rules.clear();
     rules.add( "a=>b,c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "b" );
     assertTokIncludes( synMap, "a", "c" );
@@ -78,8 +81,8 @@ public class TestSynonymMap extends LuceneTestCase {
     rules.clear();
     rules.add( "a=>a1" );
     rules.add( "a b=>a2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a1" );
     assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -92,8 +95,8 @@ public class TestSynonymMap extends LuceneTestCase {
     rules.add( "a=>a1" );
     rules.add( "a b=>a2" );
     rules.add( "a c=>a3" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a1" );
     assertEquals( 2, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -109,8 +112,8 @@ public class TestSynonymMap extends LuceneTestCase {
     rules.add( "a b=>a2" );
     rules.add( "b=>b1" );
     rules.add( "b c=>b2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a1" );
     assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -121,14 +124,14 @@ public class TestSynonymMap extends LuceneTestCase {
   }
 
   public void testRead1waySynonymRules() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;
 
     // (a)->[a]
     // (b)->[a]
     List<String> rules = new ArrayList<String>();
     rules.add( "a,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "b", "a" );
@@ -138,8 +141,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (c)->[a]
     rules.clear();
     rules.add( "a,b,c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 3, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "b", "a" );
@@ -149,8 +152,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (b1)->(b2)->[a]
     rules.clear();
     rules.add( "a,b1 b2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
@@ -160,8 +163,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (b)->[a1][a2]
     rules.clear();
     rules.add( "a1 a2,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 2, synMap.submap.size() );
     assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
     assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@@ -171,14 +174,14 @@ public class TestSynonymMap extends LuceneTestCase {
   }
 
   public void testRead2waySynonymRules() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;
 
     // (a)->[a][b]
     // (b)->[a][b]
     List<String> rules = new ArrayList<String>();
     rules.add( "a,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b" );
@@ -190,8 +193,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (c)->[a][b][c]
     rules.clear();
     rules.add( "a,b,c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 3, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b" );
@@ -209,8 +212,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // [b1][b2]
     rules.clear();
     rules.add( "a,b1 b2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b1" );
@@ -226,8 +229,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // [b]
     rules.clear();
     rules.add( "a1 a2,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
     assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@@ -239,7 +242,7 @@ public class TestSynonymMap extends LuceneTestCase {
   }
 
   public void testBigramTokenizer() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;
 
     // prepare bi-gram tokenizer factory
     BaseTokenizerFactory tf = new NGramTokenizerFactory();
@@ -251,8 +254,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (ab)->(bc)->(cd)->[ef][fg][gh]
     List<String> rules = new ArrayList<String>();
     rules.add( "abcd=>efgh" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
     assertEquals( 1, synMap.submap.size() );
     assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
     assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
@@ -265,7 +268,7 @@ public class TestSynonymMap extends LuceneTestCase {
   public void testLoadRules() throws Exception {
     Map<String,String> args = new HashMap<String,String>();
     args.put( "synonyms", "something.txt" );
-    SynonymFilterFactory ff = new SynonymFilterFactory();
+    SlowSynonymFilterFactory ff = new SlowSynonymFilterFactory();
     ff.init(args);
     ff.inform( new ResourceLoader() {
       @Override
@@ -289,7 +292,7 @@ public class TestSynonymMap extends LuceneTestCase {
       }
     });
 
-    SynonymMap synMap = ff.getSynonymMap();
+    SlowSynonymMap synMap = ff.getSynonymMap();
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b" );
@@ -298,7 +301,7 @@ public class TestSynonymMap extends LuceneTestCase {
   }
 
 
-  private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception {
+  private void assertTokIncludes( SlowSynonymMap map, String src, String exp ) throws Exception {
     Token[] tokens = map.submap.get( src ).synonyms;
     boolean inc = false;
     for( Token token : tokens ){
@@ -308,7 +311,7 @@ public class TestSynonymMap extends LuceneTestCase {
     assertTrue( inc );
   }
 
-  private SynonymMap getSubSynonymMap( SynonymMap map, String src ){
+  private SlowSynonymMap getSubSynonymMap( SlowSynonymMap map, String src ){
     return map.submap.get( src );
   }
 }
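One way to read the asserts in the factory tests above: the expected int[] { 1, 0, 0, 0 } is the position-increment pattern of stacked synonyms, one original token at increment 1 and its expansions at increment 0. A small helper using only the standard attribute API (the class and method names here are illustrative, nothing patch-specific) that prints this for any TokenStream:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    public class DumpTokens {
      // For the "GB" input above this would print GB(+1), gib(+0),
      // gigabyte(+0), gigabytes(+0): the synonyms share one position.
      public static void dump(TokenStream ts) throws IOException {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc =
            ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()
              + "(+" + posInc.getPositionIncrement() + ")");
        }
        ts.end();
        ts.close();
      }
    }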