LUCENE-3233: improve ram/perf of SynonymFilter, add wordnet parsing, nuke contrib/wordnet

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145158 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-07-11 12:58:52 +00:00
parent 19fd2508c6
commit 015ecfa0a0
55 changed files with 2945 additions and 2205 deletions

View File

@ -230,7 +230,6 @@
<packageset dir="contrib/misc/src/java"/>
<packageset dir="contrib/queries/src/java"/>
<packageset dir="contrib/spatial/src/java"/>
<packageset dir="contrib/wordnet/src/java"/>
<packageset dir="contrib/xml-query-parser/src/java"/>
<packageset dir="contrib/queryparser/src/java"/>
<!-- end alpha sort -->
@ -250,7 +249,6 @@
<group title="contrib: Queries" packages="org.apache.lucene.search.similar*:org.apache.lucene.search.regex*:org.apache.regexp*"/>
<group title="contrib: Query Parser" packages="org.apache.lucene.queryParser.*"/>
<group title="contrib: Spatial" packages="org.apache.lucene.spatial*"/>
<group title="contrib: WordNet" packages="org.apache.lucene.wordnet*"/>
<group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>
</sources>

View File

@ -5,11 +5,6 @@ http://s.apache.org/luceneversions
======================= Trunk (not yet released) =======================
Changes in runtime behavior
* LUCENE-3250: Wordnet's SynExpand requires a non-null Analyzer (it no longer
treats null as StandardAnalyzer). (Robert Muir)
Build
* LUCENE-2845: Moved contrib/benchmark to modules.
@ -78,6 +73,10 @@ New Features
documents must be indexed as a document block, using
IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless)
* LUCENE-3233: Added SynonymFilter for applying multi-word synonyms
during indexing or querying (with parsers for wordnet and solr formats).
Removed contrib/wordnet. (Robert Muir, Mike McCandless)
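  A minimal usage sketch of the new filter (a hedged illustration, not part of
  this entry: class and method names follow the LUCENE-3233 API, but exact
  signatures may differ, and the tokenizer/Reader wiring is assumed):

    SynonymMap.Builder builder = new SynonymMap.Builder(true);          // dedup duplicate rules
    builder.add(new CharsRef("woods"), new CharsRef("forest"), true);   // keep the original token
    SynonymMap map = builder.build();
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
    ts = new SynonymFilter(ts, map, true);                              // ignoreCase = true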
API Changes
Bug Fixes

View File

@ -1,5 +0,0 @@
As of 2002-11-13, the WordNet Lucene contribution contains a single Java class:
org.apache.lucene.wordnet.Syns2Index.
This class creates a Lucene index with synonyms for English words, read from
a Prolog file that is part of the WordNet database.

View File

@ -1,70 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="wordnet" default="default">
<description>
WordNet
</description>
<property name="prolog.file" location="prologwn/wn_s.pl"/>
<property name="synindex.dir" location="index"/>
<available property="synindex.exists" file="${synindex.dir}" type="dir"/>
<import file="../contrib-build.xml"/>
<target name="index" depends="compile" description="Build WordNet index">
<fail if="synindex.exists">
Index already exists - must remove first.
</fail>
<java classname="org.apache.lucene.wordnet.Syns2Index">
<classpath>
<path refid="compile.classpath"/>
<pathelement location="${build.dir}/classes"/>
</classpath>
<arg file="${prolog.file}"/>
<arg file="${synindex.dir}"/>
</java>
</target>
<target name="synonym" description="Find synonyms for word">
<fail unless="synindex.exists">
Index does not exist.
</fail>
<fail unless="word">
Must specify 'word' property.
</fail>
<java classname="org.apache.lucene.wordnet.SynLookup">
<classpath>
<path refid="compile.classpath"/>
<pathelement location="${build.dir}/classes"/>
</classpath>
<arg file="${synindex.dir}"/>
<arg value="${word}"/>
</java>
</target>
</project>

View File

@ -1,142 +0,0 @@
package org.apache.lucene.wordnet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
/**
* Expand a query by looking up synonyms for every term.
* You need to invoke {@link Syns2Index} first to build the synonym index.
*
* @see Syns2Index
*/
public final class SynExpand {
/**
* Perform synonym expansion on a query.
*
* @param query the user's query, assumed to contain no "special" query syntax; plain words such as "big dog" make sense here, whereas something like "title:foo^1.2" should instead be handed to the regular query parser.
*
* @param syns an IndexSearcher opened over the Lucene index you previously created with {@link Syns2Index}. The searcher is not closed or otherwise altered.
*
* @param a the analyzer used to parse the user's query.
*
* @param f optional field name to search in, or null to use the default of "contents".
*
* @param boost optional boost applied to synonym clauses; values of 0 or less apply no boost.
*
* @return the expanded Query
*/
public static Query expand( String query,
IndexSearcher syns,
Analyzer a,
String f,
final float boost)
throws IOException
{
final Set<String> already = new HashSet<String>(); // avoid dups
List<String> top = new LinkedList<String>(); // needs to be separately listed..
final String field = ( f == null) ? "contents" : f;
// [1] Parse query into separate words so that when we expand we can avoid dups
TokenStream ts = a.reusableTokenStream( field, new StringReader( query));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
String word = termAtt.toString();
if ( already.add( word))
top.add( word);
}
ts.end();
ts.close();
final BooleanQuery tmp = new BooleanQuery();
// [2] form query
Iterator<String> it = top.iterator();
while ( it.hasNext())
{
// [2a] add the top-level word in
String word = it.next();
TermQuery tq = new TermQuery( new Term( field, word));
tmp.add( tq, BooleanClause.Occur.SHOULD);
syns.search(new TermQuery( new Term(Syns2Index.F_WORD, word)), new Collector() {
IndexReader reader;
@Override
public boolean acceptsDocsOutOfOrder() {
return true;
}
@Override
public void collect(int doc) throws IOException {
Document d = reader.document(doc);
String[] values = d.getValues( Syns2Index.F_SYN);
for ( int j = 0; j < values.length; j++)
{
String syn = values[ j];
if ( already.add( syn)) // avoid dups of top level words and synonyms
{
TermQuery tq = new TermQuery( new Term( field, syn));
if ( boost > 0) // else keep normal 1.0
tq.setBoost( boost);
tmp.add( tq, BooleanClause.Occur.SHOULD);
}
}
}
@Override
public void setNextReader(AtomicReaderContext context)
throws IOException {
this.reader = context.reader;
}
@Override
public void setScorer(Scorer scorer) throws IOException {}
});
// [2b] add in unique synonyms
}
return tmp;
}
}
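For reference, a hedged sketch of driving the expand() method shown above (the index path, analyzer choice and query string are assumptions; the searcher and directory calls mirror SynLookup and TestWordnet elsewhere in this commit):

import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.wordnet.SynExpand;

public class SynExpandExample {
  public static void main(String[] args) throws Exception {
    // Open the synonym index previously built with Syns2Index (path is illustrative).
    FSDirectory dir = FSDirectory.open(new File("synindex"));
    IndexSearcher syns = new IndexSearcher(dir, true);   // read-only, as in SynLookup
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    // Expand a plain user query against the default "contents" field;
    // a boost of 0 or less leaves the synonym clauses unboosted.
    Query expanded = SynExpand.expand("big dog", syns, analyzer, null, 0f);
    System.out.println(expanded);
    syns.close();
    dir.close();
  }
}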

View File

@ -1,170 +0,0 @@
package org.apache.lucene.wordnet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.store.FSDirectory;
/**
* Test program to look up synonyms.
*/
public class SynLookup {
public static void main(String[] args) throws IOException {
if (args.length != 2) {
System.out.println(
"java org.apache.lucene.wordnet.SynLookup <index path> <word>");
return; // bail out; args[0] and args[1] are required below
}
FSDirectory directory = FSDirectory.open(new File(args[0]));
IndexSearcher searcher = new IndexSearcher(directory, true);
String word = args[1];
Query query = new TermQuery(new Term(Syns2Index.F_WORD, word));
TotalHitCountCollector countingCollector = new TotalHitCountCollector();
searcher.search(query, countingCollector);
if (countingCollector.getTotalHits() == 0) {
System.out.println("No synonyms found for " + word);
} else {
System.out.println("Synonyms found for \"" + word + "\":");
}
ScoreDoc[] hits = searcher.search(query, countingCollector.getTotalHits()).scoreDocs;
for (int i = 0; i < hits.length; i++) {
Document doc = searcher.doc(hits[i].doc);
String[] values = doc.getValues(Syns2Index.F_SYN);
for (int j = 0; j < values.length; j++) {
System.out.println(values[j]);
}
}
searcher.close();
directory.close();
}
/**
* Perform synonym expansion on a query.
*
* @param query the user's query (plain words, no special query syntax)
* @param syns an IndexSearcher opened over the synonym index created with {@link Syns2Index}
* @param a the analyzer used to parse the user's query
* @param field the field name to search in
* @param boost boost applied to synonym clauses; values of 0 or less apply no boost
*/
public static Query expand( String query,
IndexSearcher syns,
Analyzer a,
final String field,
final float boost)
throws IOException
{
final Set<String> already = new HashSet<String>(); // avoid dups
List<String> top = new LinkedList<String>(); // needs to be separately listed..
// [1] Parse query into separate words so that when we expand we can avoid dups
TokenStream ts = a.reusableTokenStream( field, new StringReader( query));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while (ts.incrementToken()) {
String word = termAtt.toString();
if ( already.add( word))
top.add( word);
}
final BooleanQuery tmp = new BooleanQuery();
// [2] form query
Iterator<String> it = top.iterator();
while ( it.hasNext())
{
// [2a] add the top-level word in
String word = it.next();
TermQuery tq = new TermQuery( new Term( field, word));
tmp.add( tq, BooleanClause.Occur.SHOULD);
// [2b] add in unique synonyms
syns.search(new TermQuery( new Term(Syns2Index.F_WORD, word)), new Collector() {
IndexReader reader;
@Override
public boolean acceptsDocsOutOfOrder() {
return true;
}
@Override
public void collect(int doc) throws IOException {
Document d = reader.document(doc);
String[] values = d.getValues( Syns2Index.F_SYN);
for ( int j = 0; j < values.length; j++)
{
String syn = values[ j];
if ( already.add( syn))
{
TermQuery tq = new TermQuery( new Term( field, syn));
if ( boost > 0) // else keep normal 1.0
tq.setBoost( boost);
tmp.add( tq, BooleanClause.Occur.SHOULD);
}
}
}
@Override
public void setNextReader(AtomicReaderContext context)
throws IOException {
this.reader = context.reader;
}
@Override
public void setScorer(Scorer scorer) throws IOException {}
});
}
return tmp;
}
}

View File

@ -1,400 +0,0 @@
package org.apache.lucene.wordnet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
/**
* Loads the <a target="_blank"
* href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
* href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
* into a thread-safe main-memory hash map that can be used for fast
* high-frequency lookups of synonyms for any given (lowercase) word string.
* <p>
* The following holds: if B is a synonym for A (A -> B), then A is also a synonym for B (B -> A).
* It does not necessarily hold that if A -> B and B -> C, then A -> C.
* <p>
* Loading typically takes about 1.5 seconds, so it should be done only once per
* (server) program execution, using a singleton pattern. Once loaded, a
* synonym lookup via {@link #getSynonyms(String)} takes constant time O(1).
* A loaded default synonym map consumes about 10 MB main memory.
* An instance is immutable, hence thread-safe.
* <p>
* This implementation borrows some ideas from the Lucene Syns2Index demo that
* Dave Spencer originally contributed to Lucene. Dave's approach
* involved a persistent Lucene index which is suitable for occasional
* lookups or very large synonym tables, but considered unsuitable for
* high-frequency lookups of medium size synonym tables.
* <p>
* Example Usage:
* <pre class="prettyprint">
* String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
* SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
* for (int i = 0; i &lt; words.length; i++) {
* String[] synonyms = map.getSynonyms(words[i]);
* System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
* }
* </pre>
* <br>
* Example output:
* <pre class="prettyprint">
* hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
* woods:[forest, wood]
* forest:[afforest, timber, timberland, wood, woodland, woods]
* wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
* xxxx:[]
* </pre>
*
* <p>
* <b>See also:</b><br>
* <a target="_blank"
* href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
* man page </a><br>
* <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
*/
public class SynonymMap {
/** the index data; Map<String word, String[] synonyms> */
private final HashMap<String,String[]> table;
private static final String[] EMPTY = new String[0];
private static final boolean DEBUG = false;
/**
* Constructs an instance, loading WordNet synonym data from the given input
* stream. Finally closes the stream. The words in the stream must be in
* UTF-8 or a compatible subset such as plain ASCII.
*
* @param input
* the stream to read from (null indicates an empty synonym map)
* @throws IOException
* if an error occurred while reading the stream.
*/
public SynonymMap(InputStream input) throws IOException {
this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input));
}
/**
* Returns the synonym set for the given word, sorted ascending.
*
* @param word
* the word to lookup (must be in lowercase).
* @return the synonyms; a set of zero or more words, sorted ascending, each
* word containing lowercase characters that satisfy
* <code>Character.isLetter()</code>.
*/
public String[] getSynonyms(String word) {
String[] synonyms = table.get(word);
if (synonyms == null) return EMPTY;
String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
return copy;
}
/**
* Returns a String representation of the index data for debugging purposes.
*
* @return a String representation
*/
@Override
public String toString() {
StringBuilder buf = new StringBuilder();
Iterator<String> iter = new TreeMap<String,String[]>(table).keySet().iterator();
int count = 0;
int f0 = 0;
int f1 = 0;
int f2 = 0;
int f3 = 0;
while (iter.hasNext()) {
String word = iter.next();
buf.append(word + ":");
String[] synonyms = getSynonyms(word);
buf.append(Arrays.asList(synonyms));
buf.append("\n");
count += synonyms.length;
if (synonyms.length == 0) f0++;
if (synonyms.length == 1) f1++;
if (synonyms.length == 2) f2++;
if (synonyms.length == 3) f3++;
}
buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
return buf.toString();
}
/**
* Analyzes/transforms the given word on input stream loading. This default implementation simply
* lowercases the word. Override this method with a custom stemming
* algorithm or similar, if desired.
*
* @param word
* the word to analyze
* @return the same word, or a different word (or null to indicate that the
* word should be ignored)
*/
protected String analyze(String word) {
return word.toLowerCase();
}
protected boolean isValid(String str) {
for (int i=str.length(); --i >= 0; ) {
if (!Character.isLetter(str.charAt(i))) return false;
}
return true;
}
private HashMap<String,String[]> read(byte[] data) {
int WORDS = (int) (76401 / 0.7); // presizing
int GROUPS = (int) (88022 / 0.7); // presizing
HashMap<String,ArrayList<Integer>> word2Groups = new HashMap<String,ArrayList<Integer>>(WORDS); // Map<String word, int[] groups>
HashMap<Integer,ArrayList<String>> group2Words = new HashMap<Integer,ArrayList<String>>(GROUPS); // Map<int group, String[] words>
HashMap<String,String> internedWords = new HashMap<String,String>(WORDS);// Map<String word, String word>
Charset charset = Charset.forName("UTF-8");
int lastNum = -1;
Integer lastGroup = null;
int len = data.length;
int i=0;
while (i < len) { // until EOF
/* Part A: Parse a line */
// scan to beginning of group
while (i < len && data[i] != '(') i++;
if (i >= len) break; // EOF
i++;
// parse group
int num = 0;
while (i < len && data[i] != ',') {
num = 10*num + (data[i] - 48);
i++;
}
i++;
// if (DEBUG) System.err.println("num="+ num);
// scan to beginning of word
while (i < len && data[i] != '\'') i++;
i++;
// scan to end of word
int start = i;
do {
while (i < len && data[i] != '\'') i++;
i++;
} while (i < len && data[i] != ','); // word must end with "',"
if (i >= len) break; // EOF
String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
// String word = new String(data, 0, start, i-start-1); // ASCII
/*
* Part B: ignore phrases (with spaces and hyphens) and
* non-alphabetic words, and let user customize word (e.g. do some
* stemming)
*/
if (!isValid(word)) continue; // ignore
word = analyze(word);
if (word == null || word.length() == 0) continue; // ignore
/* Part C: Add (group,word) to tables */
// ensure compact string representation, minimizing memory overhead
String w = internedWords.get(word);
if (w == null) {
word = new String(word); // ensure compact string
internedWords.put(word, word);
} else {
word = w;
}
Integer group = lastGroup;
if (num != lastNum) {
group = Integer.valueOf(num);
lastGroup = group;
lastNum = num;
}
// add word --> group
ArrayList<Integer> groups = word2Groups.get(word);
if (groups == null) {
groups = new ArrayList<Integer>(1);
word2Groups.put(word, groups);
}
groups.add(group);
// add group --> word
ArrayList<String> words = group2Words.get(group);
if (words == null) {
words = new ArrayList<String>(1);
group2Words.put(group, words);
}
words.add(word);
}
/* Part D: compute index data structure */
HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);
/* Part E: minimize memory consumption by a factor 3 (or so) */
// if (true) return word2Syns;
word2Groups = null; // help gc
//TODO: word2Groups.clear(); would be more appropriate ?
group2Words = null; // help gc
//TODO: group2Words.clear(); would be more appropriate ?
return optimize(word2Syns, internedWords);
}
private HashMap<String,String[]> createIndex(Map<String,ArrayList<Integer>> word2Groups, Map<Integer,ArrayList<String>> group2Words) {
HashMap<String,String[]> word2Syns = new HashMap<String,String[]>();
for (final Map.Entry<String,ArrayList<Integer>> entry : word2Groups.entrySet()) { // for each word
ArrayList<Integer> group = entry.getValue();
String word = entry.getKey();
// HashSet synonyms = new HashSet();
TreeSet<String> synonyms = new TreeSet<String>();
for (int i=group.size(); --i >= 0; ) { // for each groupID of word
ArrayList<String> words = group2Words.get(group.get(i));
for (int j=words.size(); --j >= 0; ) { // add all words
String synonym = words.get(j); // note that w and word are interned
if (synonym != word) { // a word is implicitly its own synonym
synonyms.add(synonym);
}
}
}
int size = synonyms.size();
if (size > 0) {
String[] syns = new String[size];
if (size == 1)
syns[0] = synonyms.first();
else
synonyms.toArray(syns);
// if (syns.length > 1) Arrays.sort(syns);
// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
word2Syns.put(word, syns);
}
}
return word2Syns;
}
private HashMap<String,String[]> optimize(HashMap<String,String[]> word2Syns, HashMap<String,String> internedWords) {
if (DEBUG) {
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
// collect entries
int len = 0;
int size = word2Syns.size();
String[][] allSynonyms = new String[size][];
String[] words = new String[size];
Iterator<Map.Entry<String,String[]>> iter = word2Syns.entrySet().iterator();
for (int j=0; j < size; j++) {
Map.Entry<String,String[]> entry = iter.next();
allSynonyms[j] = entry.getValue();
words[j] = entry.getKey();
len += words[j].length();
}
// assemble large string containing all words
StringBuilder buf = new StringBuilder(len);
for (int j=0; j < size; j++) buf.append(words[j]);
String allWords = new String(buf.toString()); // ensure compact string across JDK versions
buf = null;
// intern words at app level via memory-overlaid substrings
for (int p=0, j=0; j < size; j++) {
String word = words[j];
internedWords.put(word, allWords.substring(p, p + word.length()));
p += word.length();
}
// replace words with interned words
for (int j=0; j < size; j++) {
String[] syns = allSynonyms[j];
for (int k=syns.length; --k >= 0; ) {
syns[k] = internedWords.get(syns[k]);
}
word2Syns.remove(words[j]);
word2Syns.put(internedWords.get(words[j]), syns);
}
if (DEBUG) {
words = null;
allSynonyms = null;
internedWords = null;
allWords = null;
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
return word2Syns;
}
// the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
private static byte[] toByteArray(InputStream input) throws IOException {
try {
// safe and fast even if input.available() behaves weird or buggy
int len = Math.max(256, input.available());
byte[] buffer = new byte[len];
byte[] output = new byte[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
if (len == output.length) return output;
buffer = null; // help gc
buffer = new byte[len];
System.arraycopy(output, 0, buffer, 0, len);
return buffer;
} finally {
input.close();
}
}
}

View File

@ -1,148 +0,0 @@
package org.apache.lucene.wordnet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Injects additional tokens for synonyms of token terms fetched from the
* underlying child stream; the child stream must deliver lowercase tokens
* for synonyms to be found.
*
*/
public class SynonymTokenFilter extends TokenFilter {
/** The Token.type used to indicate a synonym to higher level filters. */
public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
private final SynonymMap synonyms;
private final int maxSynonyms;
private String[] stack = null;
private int index = 0;
private AttributeSource.State current = null;
private int todo = 0;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates an instance for the given underlying stream and synonym table.
*
* @param input
* the underlying child token stream
* @param synonyms
* the map used to extract synonyms for terms
* @param maxSynonyms
* the maximum number of synonym tokens to return per underlying
* token word (a value of Integer.MAX_VALUE indicates unlimited)
*/
public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
super(input);
if (input == null)
throw new IllegalArgumentException("input must not be null");
if (synonyms == null)
throw new IllegalArgumentException("synonyms must not be null");
if (maxSynonyms < 0)
throw new IllegalArgumentException("maxSynonyms must not be negative");
this.synonyms = synonyms;
this.maxSynonyms = maxSynonyms;
}
/** Advances to the next token in the stream; returns false at EOS. */
@Override
public final boolean incrementToken() throws IOException {
while (todo > 0 && index < stack.length) { // pop from stack
if (createToken(stack[index++], current)) {
todo--;
return true;
}
}
if (!input.incrementToken()) return false; // EOS; iterator exhausted
stack = synonyms.getSynonyms(termAtt.toString()); // push onto stack
if (stack.length > maxSynonyms) randomize(stack);
index = 0;
current = captureState();
todo = maxSynonyms;
return true;
}
/**
* Creates and returns a token for the given synonym of the current input
* token; Override for custom (stateless or stateful) behavior, if desired.
*
* @param synonym
* a synonym for the current token's term
* @param current
* the current token from the underlying child stream
* @return true to emit a token for the given synonym, or false to indicate
* that the synonym should be ignored
*/
protected boolean createToken(String synonym, AttributeSource.State current) {
restoreState(current);
termAtt.setEmpty().append(synonym);
typeAtt.setType(SYNONYM_TOKEN_TYPE);
posIncrAtt.setPositionIncrement(0);
return true;
}
/**
* Randomize synonyms to later sample a subset. Uses constant random seed
* for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
* number generator with medium statistical quality (multiplicative
* congruential method), producing integers in the range [Integer.MIN_VALUE,
* Integer.MAX_VALUE].
*/
private static void randomize(Object[] arr) {
int seed = 1234567; // constant
int randomState = 4*seed + 1;
// Random random = new Random(seed); // unnecessary overhead
int len = arr.length;
for (int i=0; i < len-1; i++) {
randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
int r = randomState % (len-i);
if (r < 0) r = -r; // e.g. -9 % 2 == -1
// int r = random.nextInt(len-i);
// swap arr[i, i+r]
Object tmp = arr[i];
arr[i] = arr[i + r];
arr[i + r] = tmp;
}
}
@Override
public void reset() throws IOException {
super.reset();
stack = null;
index = 0;
current = null;
todo = 0;
}
}

View File

@ -1,329 +0,0 @@
package org.apache.lucene.wordnet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
* Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
* into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
*
* This has been tested with WordNet 2.0.
*
* The index has fields named "word" ({@link #F_WORD})
* and "syn" ({@link #F_SYN}).
* <p>
* The source word (such as 'big') can be looked up in the
* "word" field, and if present there will be fields named "syn"
* for every synonym. What's tricky here is that there could be <b>multiple</b>
* fields with the same name, in the general case for words that have multiple synonyms.
* That's not a problem for Lucene; you just use {@link org.apache.lucene.document.Document#getValues}.
* </p>
* <p>
* While the WordNet file distinguishes groups of synonyms with
* related meanings we don't do that here.
* </p>
*
* Building the index can take around 4 minutes on a "fast" system, and the resulting index takes up almost 3 MB.
*
* @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
* @see <a href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb man page</a>
* @see <a href="http://www.hostmon.com/rfc/advanced.jsp">sample site that uses it</a>
*/
public class Syns2Index
{
/**
*
*/
private static final PrintStream o = System.out;
/**
*
*/
private static final PrintStream err = System.err;
/**
*
*/
public static final String F_SYN = "syn";
/**
*
*/
public static final String F_WORD = "word";
/**
* We don't actually analyze any text (the field is NOT_ANALYZED), but the
* analyzer can't be null: the doc inverter still asks it for the offset gap.
*/
private static final Analyzer ana = new Analyzer() {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return null;
}
};
/**
* Takes arg of prolog file name and index directory.
*/
public static void main(String[] args)
throws Throwable
{
// get command line arguments
String prologFilename = null; // name of file "wn_s.pl"
String indexDir = null;
if (args.length == 2)
{
prologFilename = args[0];
indexDir = args[1];
}
else
{
usage();
System.exit(1);
}
// ensure that the prolog file is readable
if (! (new File(prologFilename)).canRead())
{
err.println("Error: cannot read Prolog file: " + prologFilename);
System.exit(1);
}
// exit if the target index directory already exists
if ((new File(indexDir)).isDirectory())
{
err.println("Error: index directory already exists: " + indexDir);
err.println("Please specify a name of a non-existent directory");
System.exit(1);
}
o.println("Opening Prolog file " + prologFilename);
final FileInputStream fis = new FileInputStream(prologFilename);
final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
String line;
// maps a word to all the "groups" it's in
final Map<String,List<String>> word2Nums = new TreeMap<String,List<String>>();
// maps a group to all the words in it
final Map<String,List<String>> num2Words = new TreeMap<String,List<String>>();
// number of rejected words
int ndecent = 0;
// status output
int mod = 1;
int row = 1;
// parse prolog file
o.println( "[1/2] Parsing " + prologFilename);
while ((line = br.readLine()) != null)
{
// occasional progress
if ((++row) % mod == 0) // periodically print out line we read in
{
mod *= 2;
o.println("\t" + row + " " + line + " " + word2Nums.size()
+ " " + num2Words.size() + " ndecent=" + ndecent);
}
// syntax check
if (! line.startsWith("s("))
{
err.println("OUCH: " + line);
System.exit(1);
}
// parse line
line = line.substring(2);
int comma = line.indexOf(',');
String num = line.substring(0, comma);
int q1 = line.indexOf('\'');
line = line.substring(q1 + 1);
int q2 = line.lastIndexOf('\'');
String word = line.substring(0, q2).toLowerCase().replace("''", "'");
// make sure is a normal word
if (! isDecent(word))
{
ndecent++;
continue; // don't store words w/ spaces
}
// 1/2: word2Nums map
// append to entry or add new one
List<String> lis = word2Nums.get(word);
if (lis == null)
{
lis = new LinkedList<String>();
lis.add(num);
word2Nums.put(word, lis);
}
else
lis.add(num);
// 2/2: num2Words map
lis = num2Words.get(num);
if (lis == null)
{
lis = new LinkedList<String>();
lis.add(word);
num2Words.put(num, lis);
}
else
lis.add(word);
}
// close the streams
fis.close();
br.close();
// create the index
o.println( "[2/2] Building index to store synonyms, " +
" map sizes are " + word2Nums.size() + " and " + num2Words.size());
index(indexDir, word2Nums, num2Words);
}
/**
* Checks to see if a word contains only alphabetic characters by
* checking it one character at a time.
*
* @param s string to check
* @return <code>true</code> if the string is decent
*/
private static boolean isDecent(String s)
{
int len = s.length();
for (int i = 0; i < len; i++)
{
if (!Character.isLetter(s.charAt(i)))
{
return false;
}
}
return true;
}
/**
* Forms a Lucene index based on the 2 maps.
*
* @param indexDir the directory where the index should be created
* @param word2Nums
* @param num2Words
*/
private static void index(String indexDir, Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words)
throws Throwable
{
int row = 0;
int mod = 1;
FSDirectory dir = FSDirectory.open(new File(indexDir));
try {
// overwrite the index if it already exists
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
Version.LUCENE_CURRENT, ana).setOpenMode(OpenMode.CREATE));
((TieredMergePolicy) writer.getConfig().getMergePolicy()).setUseCompoundFile(true); // why?
Iterator<String> i1 = word2Nums.keySet().iterator();
while (i1.hasNext()) // for each word
{
String g = i1.next();
Document doc = new Document();
int n = index(word2Nums, num2Words, g, doc);
if (n > 0)
{
doc.add( new Field( F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED));
if ((++row % mod) == 0)
{
o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc);
mod *= 2;
}
writer.addDocument(doc);
} // else degenerate
}
o.println( "Optimizing..");
writer.optimize();
writer.close();
} finally {
dir.close();
}
}
/**
* Given the two maps, fills a document for one word.
*/
private static int index(Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words, String g, Document doc)
throws Throwable
{
List<String> keys = word2Nums.get(g); // get list of key#'s
Iterator<String> i2 = keys.iterator();
Set<String> already = new TreeSet<String>(); // keep them sorted
// pass 1: fill up 'already' with all words
while (i2.hasNext()) // for each key#
{
already.addAll(num2Words.get(i2.next())); // get list of words
}
int num = 0;
already.remove(g); // of course a word is it's own syn
Iterator<String> it = already.iterator();
while (it.hasNext())
{
String cur = it.next();
// don't store things like 'pit bull' -> 'american pit bull'
if (!isDecent(cur))
{
continue;
}
num++;
doc.add( new Field( F_SYN, cur, Field.Store.YES, Field.Index.NO));
}
return num;
}
/**
*
*/
private static void usage()
{
o.println("\n\n" +
"java org.apache.lucene.wordnet.Syns2Index <prolog file> <index dir>\n\n");
}
}

View File

@ -1,57 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<title>WordNet Lucene Synonyms Integration</title>
</head>
<body>
This package uses synonyms defined by <a href="http://www.cogsci.princeton.edu/~wn/">WordNet</a>.
There are two methods: query expansion and analysis.
Both methods first require you to download the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog database</a>.
Inside this archive is a file named wn_s.pl, which contains the WordNet synonyms.
<h1>Query Expansion Method</h1>
This method creates a Lucene index storing the synonyms, which in turn can be used for query expansion.
You normally run {@link org.apache.lucene.wordnet.Syns2Index} once to build the query index/"database", and then call
{@link org.apache.lucene.wordnet.SynExpand#expand SynExpand.expand(...)} to expand a query.
<p>
<h3> Instructions </h3>
<ol>
<li> Invoke Syns2Index to build a synonym index.
It takes two arguments: the path to wn_s.pl from the WordNet download, and the index directory.
<li> Update your search UI to call SynExpand.expand(...) where appropriate, so that user queries are expanded with synonyms.
</ol>
<h1>Analysis Method</h1>
This method injects additional synonym tokens for tokens from a child {@link org.apache.lucene.analysis.TokenStream}.
<h3> Instructions </h3>
<ol>
<li>Create a {@link org.apache.lucene.wordnet.SynonymMap}, passing in the path to wn_s.pl
<li>Add a {@link org.apache.lucene.wordnet.SynonymTokenFilter} to your analyzer. Note: SynonymTokenFilter should be after LowerCaseFilter,
because it expects terms to already be in lowercase.
</ol>
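<p>
A minimal sketch of the analysis method described above (hedged illustration only:
the file path, tokenizer, Reader and synonym limit are assumptions, not part of this package):
<pre>
  SynonymMap map = new SynonymMap(new FileInputStream("wn_s.pl"));
  TokenStream ts = new LowerCaseFilter(Version.LUCENE_CURRENT,
      new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader));
  ts = new SynonymTokenFilter(ts, map, 10); // inject at most 10 synonyms per token
</pre>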
</body>
</html>

View File

@ -1,119 +0,0 @@
package org.apache.lucene.wordnet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
public class TestSynonymTokenFilter extends BaseTokenStreamTestCase {
final String testFile = "testSynonyms.txt";
public void testSynonyms() throws Exception {
SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
/* all expansions */
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE);
assertAnalyzesTo(analyzer, "Lost in the woods",
new String[] { "lost", "in", "the", "woods", "forest", "wood" },
new int[] { 0, 5, 8, 12, 12, 12 },
new int[] { 4, 7, 11, 17, 17, 17 },
new int[] { 1, 1, 1, 1, 0, 0 });
}
public void testSynonymsSingleQuote() throws Exception {
SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
/* all expansions */
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE);
assertAnalyzesTo(analyzer, "king",
new String[] { "king", "baron" });
}
public void testSynonymsLimitedAmount() throws Exception {
SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
/* limit to one synonym expansion */
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
assertAnalyzesTo(analyzer, "Lost in the woods",
/* wood comes before forest due to
* the input file, not lexicographic order
*/
new String[] { "lost", "in", "the", "woods", "wood" },
new int[] { 0, 5, 8, 12, 12 },
new int[] { 4, 7, 11, 17, 17 },
new int[] { 1, 1, 1, 1, 0 });
}
public void testReusableTokenStream() throws Exception {
SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
/* limit to one synonym expansion */
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
assertAnalyzesToReuse(analyzer, "Lost in the woods",
new String[] { "lost", "in", "the", "woods", "wood" },
new int[] { 0, 5, 8, 12, 12 },
new int[] { 4, 7, 11, 17, 17 },
new int[] { 1, 1, 1, 1, 0 });
assertAnalyzesToReuse(analyzer, "My wolfish dog went to the forest",
new String[] { "my", "wolfish", "ravenous", "dog", "went", "to",
"the", "forest", "woods" },
new int[] { 0, 3, 3, 11, 15, 20, 23, 27, 27 },
new int[] { 2, 10, 10, 14, 19, 22, 26, 33, 33 },
new int[] { 1, 1, 0, 1, 1, 1, 1, 1, 0 });
}
private class SynonymWhitespaceAnalyzer extends Analyzer {
private SynonymMap synonyms;
private int maxSynonyms;
public SynonymWhitespaceAnalyzer(SynonymMap synonyms, int maxSynonyms) {
this.synonyms = synonyms;
this.maxSynonyms = maxSynonyms;
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream ts = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
ts = new SynonymTokenFilter(ts, synonyms, maxSynonyms);
return ts;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
streams.result = new SynonymTokenFilter(streams.source, synonyms, maxSynonyms);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
}
}
}

View File

@ -1,94 +0,0 @@
package org.apache.lucene.wordnet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class TestWordnet extends LuceneTestCase {
private IndexSearcher searcher;
private Directory dir;
String storePathName = new File(TEMP_DIR,"testLuceneWordnet").getAbsolutePath();
@Override
public void setUp() throws Exception {
super.setUp();
// create a temporary synonym index
File testFile = getDataFile("testSynonyms.txt");
String commandLineArgs[] = { testFile.getAbsolutePath(), storePathName };
_TestUtil.rmDir(new File(storePathName));
try {
Syns2Index.main(commandLineArgs);
} catch (Throwable t) { throw new RuntimeException(t); }
dir = newFSDirectory(new File(storePathName));
searcher = new IndexSearcher(dir, true);
}
public void testExpansion() throws IOException {
assertExpandsTo("woods", new String[] { "woods", "forest", "wood" });
}
public void testExpansionSingleQuote() throws IOException {
assertExpandsTo("king", new String[] { "king", "baron" });
}
private void assertExpandsTo(String term, String expected[]) throws IOException {
Query expandedQuery = SynExpand.expand(term, searcher, new
MockAnalyzer(random), "field", 1F);
BooleanQuery expectedQuery = new BooleanQuery();
for (String t : expected)
expectedQuery.add(new TermQuery(new Term("field", t)),
BooleanClause.Occur.SHOULD);
assertEquals(expectedQuery, expandedQuery);
}
@Override
public void tearDown() throws Exception {
if (searcher != null) {
searcher.close();
}
if (dir != null) {
dir.close();
}
rmDir(storePathName); // delete our temporary synonym index
super.tearDown();
}
private void rmDir(String directory) {
File dir = new File(directory);
File[] files = dir.listFiles();
for (int i = 0; i < files.length; i++) {
files[i].delete();
}
dir.delete();
}
}

View File

@ -1,9 +0,0 @@
s(100000001,1,'woods',n,1,0).
s(100000001,2,'wood',n,1,0).
s(100000001,3,'forest',n,1,0).
s(100000002,1,'wolfish',n,1,0).
s(100000002,2,'ravenous',n,1,0).
s(100000003,1,'king',n,1,1).
s(100000003,2,'baron',n,1,1).
s(100000004,1,'king''sevil',n,1,1).
s(100000004,2,'meany',n,1,1).

View File

@ -95,9 +95,6 @@ public class MemoryCodec extends Codec {
this.out = out;
this.field = field;
builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
// The byte[] output we create can easily be > 255 bytes:
builder.setAllowArrayArcs(false);
}
private class PostingsWriter extends PostingsConsumer {

View File

@ -0,0 +1,52 @@
package org.apache.lucene.store;
import org.apache.lucene.util.BytesRef;
/**
* @lucene.experimental
*/
public class ByteArrayDataOutput extends DataOutput {
private byte[] bytes;
private int pos;
private int limit;
public ByteArrayDataOutput(byte[] bytes) {
reset(bytes);
}
public ByteArrayDataOutput(byte[] bytes, int offset, int len) {
reset(bytes, offset, len);
}
public ByteArrayDataOutput() {
reset(BytesRef.EMPTY_BYTES);
}
public void reset(byte[] bytes) {
reset(bytes, 0, bytes.length);
}
public void reset(byte[] bytes, int offset, int len) {
this.bytes = bytes;
pos = offset;
limit = offset + len;
}
public int getPosition() {
return pos;
}
@Override
public void writeByte(byte b) {
assert pos < limit;
bytes[pos++] = b;
}
@Override
public void writeBytes(byte[] b, int offset, int length) {
assert pos + length <= limit;
System.arraycopy(b, offset, bytes, pos, length);
pos += length;
}
}
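A brief, hedged usage sketch of the class above (buffer size and values are illustrative; writeVInt comes from the DataOutput base class):

import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataOutput;

public class ByteArrayDataOutputExample {
  public static void main(String[] args) throws IOException {
    byte[] scratch = new byte[16];                              // fixed-size destination buffer
    ByteArrayDataOutput out = new ByteArrayDataOutput(scratch);
    out.writeVInt(1234);                                        // inherited from DataOutput
    out.writeByte((byte) 42);
    System.out.println("bytes written: " + out.getPosition());  // offset was 0
    out.reset(scratch, 0, scratch.length);                      // reuse the same instance
  }
}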

View File

@ -1,5 +1,7 @@
package org.apache.lucene.util;
import java.util.Comparator;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -167,7 +169,11 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence {
* the {@link CharsRef} to copy
*/
public void copy(CharsRef other) {
if (chars == null) {
chars = new char[other.length];
} else {
chars = ArrayUtil.grow(chars, other.length);
}
System.arraycopy(other.chars, other.offset, chars, 0, other.length);
length = other.length;
offset = 0;
@ -213,4 +219,56 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence {
public CharSequence subSequence(int start, int end) {
return new CharsRef(chars, offset + start, offset + end - 1);
}
private final static Comparator<CharsRef> utf16SortedAsUTF8SortOrder = new UTF16SortedAsUTF8Comparator();
public static Comparator<CharsRef> getUTF16SortedAsUTF8Comparator() {
return utf16SortedAsUTF8SortOrder;
}
private static class UTF16SortedAsUTF8Comparator implements Comparator<CharsRef> {
// Only singleton
private UTF16SortedAsUTF8Comparator() {};
public int compare(CharsRef a, CharsRef b) {
if (a == b)
return 0;
final char[] aChars = a.chars;
int aUpto = a.offset;
final char[] bChars = b.chars;
int bUpto = b.offset;
final int aStop = aUpto + Math.min(a.length, b.length);
while (aUpto < aStop) {
char aChar = aChars[aUpto++];
char bChar = bChars[bUpto++];
if (aChar != bChar) {
// http://icu-project.org/docs/papers/utf16_code_point_order.html
/* aChar != bChar, fix up each one if they're both in or above the surrogate range, then compare them */
if (aChar >= 0xd800 && bChar >= 0xd800) {
if (aChar >= 0xe000) {
aChar -= 0x800;
} else {
aChar += 0x2000;
}
if (bChar >= 0xe000) {
bChar -= 0x800;
} else {
bChar += 0x2000;
}
}
/* now aChar and bChar are in code point order */
return (int)aChar - (int)bChar; /* int must be 32 bits wide */
}
}
// One is a prefix of the other, or, they are equal:
return a.length - b.length;
}
}
}

View File

@ -71,7 +71,11 @@ public class FST<T> {
// Increment version to change it
private final static String FILE_FORMAT_NAME = "FST";
private final static int VERSION_START = 0;
private final static int VERSION_CURRENT = VERSION_START;
/** Changed numBytesPerArc for array'd case from byte to int. */
private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1;
private final static int VERSION_CURRENT = VERSION_INT_NUM_BYTES_PER_ARC;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@ -106,6 +110,8 @@ public class FST<T> {
private boolean allowArrayArcs = true;
private Arc<T> cachedRootArcs[];
public final static class Arc<T> {
public int label;
public T output;
@ -113,7 +119,7 @@ public class FST<T> {
int target;
byte flags;
T nextFinalOutput;
public T nextFinalOutput;
int nextArc;
// This is non-zero if current arcs are fixed array:
@ -176,7 +182,7 @@ public class FST<T> {
public FST(DataInput in, Outputs<T> outputs) throws IOException {
this.outputs = outputs;
writer = null;
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_START);
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_INT_NUM_BYTES_PER_ARC, VERSION_INT_NUM_BYTES_PER_ARC);
if (in.readByte() == 1) {
// accepts empty string
int numBytes = in.readVInt();
@ -209,6 +215,8 @@ public class FST<T> {
bytes = new byte[in.readVInt()];
in.readBytes(bytes, 0, bytes.length);
NO_OUTPUT = outputs.getNoOutput();
cacheRootArcs();
}
public INPUT_TYPE getInputType() {
@ -220,7 +228,7 @@ public class FST<T> {
return bytes.length;
}
void finish(int startNode) {
void finish(int startNode) throws IOException {
if (startNode == FINAL_END_NODE && emptyOutput != null) {
startNode = 0;
}
@ -231,6 +239,32 @@ public class FST<T> {
System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite);
bytes = finalBytes;
this.startNode = startNode;
cacheRootArcs();
}
// Caches first 128 labels
@SuppressWarnings("unchecked")
private void cacheRootArcs() throws IOException {
cachedRootArcs = (FST.Arc<T>[]) new FST.Arc[0x80];
final FST.Arc<T> arc = new FST.Arc<T>();
getFirstArc(arc);
final BytesReader in = getBytesReader(0);
if (targetHasArcs(arc)) {
readFirstRealArc(arc.target, arc);
while(true) {
assert arc.label != END_LABEL;
if (arc.label < cachedRootArcs.length) {
cachedRootArcs[arc.label] = new Arc<T>().copyFrom(arc);
} else {
break;
}
if (arc.isLast()) {
break;
}
readNextRealArc(arc, in);
}
}
}
void setEmptyOutput(T v) throws IOException {
@ -345,8 +379,9 @@ public class FST<T> {
writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY);
writer.writeVInt(node.numArcs);
// placeholder -- we'll come back and write the number
// of bytes per arc here:
writer.writeByte((byte) 0);
// of bytes per arc (int) here:
// TODO: we could make this a vInt instead
writer.writeInt(0);
fixedArrayStart = writer.posWrite;
//System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart);
} else {
@ -421,15 +456,21 @@ public class FST<T> {
}
}
// TODO: if arc'd arrays will be "too wasteful" by some
// measure, eg if arcs have vastly different sized
// outputs, then we should selectively disable array for
// such cases
if (doFixedArray) {
assert maxBytesPerArc > 0;
// 2nd pass just "expands" all arcs to take up a fixed
// byte size
final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc;
bytes = ArrayUtil.grow(bytes, sizeNeeded);
if (maxBytesPerArc > 255) {
throw new IllegalStateException("max arc size is too large (" + maxBytesPerArc + "); disable array arcs by calling Builder.setAllowArrayArcs(false)");
}
// TODO: we could make this a vInt instead
bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24);
bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16);
bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8);
bytes[fixedArrayStart-1] = (byte) maxBytesPerArc;
// expand the arcs in place, backwards
@ -502,7 +543,7 @@ public class FST<T> {
if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
// array: jump straight to end
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readByte() & 0xFF;
arc.bytesPerArc = in.readInt();
//System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
arc.posArcsStart = in.pos;
arc.arcIdx = arc.numArcs - 2;
@ -528,7 +569,7 @@ public class FST<T> {
}
arc.nextArc = in.pos+1;
}
readNextRealArc(arc);
readNextRealArc(arc, in);
assert arc.isLast();
return arc;
}
@ -572,7 +613,7 @@ public class FST<T> {
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readByte() & 0xFF;
arc.bytesPerArc = in.readInt();
arc.arcIdx = -1;
arc.nextArc = arc.posArcsStart = in.pos;
//System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
@ -580,7 +621,7 @@ public class FST<T> {
arc.nextArc = address;
arc.bytesPerArc = 0;
}
return readNextRealArc(arc);
return readNextRealArc(arc, in);
}
/**
@ -609,7 +650,7 @@ public class FST<T> {
}
return readFirstRealArc(arc.nextArc, arc);
} else {
return readNextRealArc(arc);
return readNextRealArc(arc, getBytesReader(0));
}
}
@ -627,7 +668,7 @@ public class FST<T> {
//System.out.println(" nextArc fake array");
in.pos--;
in.readVInt();
in.readByte();
in.readInt();
}
} else {
if (arc.bytesPerArc != 0) {
@ -645,17 +686,16 @@ public class FST<T> {
return readLabel(in);
}
Arc<T> readNextRealArc(Arc<T> arc) throws IOException {
Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
// this is a continuing arc in a fixed array
final BytesReader in;
if (arc.bytesPerArc != 0) {
// arcs are at fixed entries
arc.arcIdx++;
assert arc.arcIdx < arc.numArcs;
in = getBytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc);
in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc;
} else {
// arcs are packed
in = getBytesReader(arc.nextArc);
in.pos = arc.nextArc;
}
arc.flags = in.readByte();
arc.label = readLabel(in);
@ -701,6 +741,17 @@ public class FST<T> {
/** Finds an arc leaving the incoming arc, replacing the arc in place.
* This returns null if the arc was not found, else the incoming arc. */
public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc) throws IOException {
assert cachedRootArcs != null;
// Short-circuit if this arc is in the root arc cache:
if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) {
final Arc<T> result = cachedRootArcs[labelToMatch];
if (result == null) {
return result;
} else {
arc.copyFrom(result);
return arc;
}
}
if (labelToMatch == END_LABEL) {
if (follow.isFinal()) {
@ -726,14 +777,18 @@ public class FST<T> {
// reusable stuff eg BytesReader:
final BytesReader in = getBytesReader(follow.target);
// System.out.println("fta label=" + (char) labelToMatch);
if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) {
// Arcs are full array; do binary search:
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readByte() & 0xFF;
//System.out.println(" bs " + arc.numArcs);
arc.bytesPerArc = in.readInt();
arc.posArcsStart = in.pos;
int low = 0;
int high = arc.numArcs-1;
while (low <= high) {
//System.out.println(" cycle");
int mid = (low + high) >>> 1;
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
int midLabel = readLabel(in);
@ -744,7 +799,8 @@ public class FST<T> {
high = mid - 1;
else {
arc.arcIdx = mid-1;
return readNextRealArc(arc);
//System.out.println(" found!");
return readNextRealArc(arc, in);
}
}
@ -754,7 +810,12 @@ public class FST<T> {
// Linear scan
readFirstTargetArc(follow, arc);
while(true) {
//System.out.println(" non-bs cycle");
// TODO: we should fix this code to not have to create
// object for the output of every arc we scan... only
// for the matching arc, if found
if (arc.label == labelToMatch) {
//System.out.println(" found!");
return arc;
} else if (arc.label > labelToMatch) {
return null;
@ -863,7 +924,7 @@ public class FST<T> {
}
// Non-static: reads byte[] from FST
class BytesReader extends DataInput {
final class BytesReader extends DataInput {
int pos;
public BytesReader(int pos) {

View File

@ -170,7 +170,7 @@ abstract class FSTEnum<T> {
if (found) {
// Match
arc.arcIdx = mid-1;
fst.readNextRealArc(arc);
fst.readNextRealArc(arc, in);
assert arc.arcIdx == mid;
assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
output[upto] = fst.outputs.add(output[upto-1], arc.output);
@ -185,7 +185,7 @@ abstract class FSTEnum<T> {
} else if (low == arc.numArcs) {
// Dead end
arc.arcIdx = arc.numArcs-2;
fst.readNextRealArc(arc);
fst.readNextRealArc(arc, in);
assert arc.isLast();
// Dead end (target is after the last arc);
// rollback to last fork then push
@ -205,7 +205,7 @@ abstract class FSTEnum<T> {
}
} else {
arc.arcIdx = (low > high ? low : high)-1;
fst.readNextRealArc(arc);
fst.readNextRealArc(arc, in);
assert arc.label > targetLabel;
pushFirst();
return;
@ -309,7 +309,7 @@ abstract class FSTEnum<T> {
// Match -- recurse
//System.out.println(" match! arcIdx=" + mid);
arc.arcIdx = mid-1;
fst.readNextRealArc(arc);
fst.readNextRealArc(arc, in);
assert arc.arcIdx == mid;
assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
output[upto] = fst.outputs.add(output[upto-1], arc.output);
@ -352,7 +352,7 @@ abstract class FSTEnum<T> {
// There is a floor arc:
arc.arcIdx = (low > high ? high : low)-1;
//System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1));
fst.readNextRealArc(arc);
fst.readNextRealArc(arc, in);
assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel;
assert arc.label < targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel;
pushLast();

View File

@ -35,6 +35,7 @@ final class NodeHash<T> {
}
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address) throws IOException {
final FST<T>.BytesReader in = fst.getBytesReader(0);
fst.readFirstRealArc(address, scratchArc);
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
return false;
@ -56,7 +57,7 @@ final class NodeHash<T> {
return false;
}
}
fst.readNextRealArc(scratchArc);
fst.readNextRealArc(scratchArc, in);
}
return false;
@ -87,6 +88,7 @@ final class NodeHash<T> {
// hash code for a frozen node
private int hash(int node) throws IOException {
final int PRIME = 31;
final FST<T>.BytesReader in = fst.getBytesReader(0);
//System.out.println("hash frozen");
int h = 0;
fst.readFirstRealArc(node, scratchArc);
@ -102,7 +104,7 @@ final class NodeHash<T> {
if (scratchArc.isLast()) {
break;
}
fst.readNextRealArc(scratchArc);
fst.readNextRealArc(scratchArc, in);
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
return h & Integer.MAX_VALUE;

View File

@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>

View File

@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>

View File

@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>

View File

@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>

View File

@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>

View File

@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>

View File

@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>
@ -359,12 +356,6 @@ document.write("Last Published: " + document.lastModified);
</li>
</ul>
<ul>
<li>
<a href="api/contrib-wordnet/index.html">Wordnet</a>&nbsp;&nbsp;___________________&nbsp;&nbsp;<em>javadoc-contrib-wordnet</em>
</li>
</ul>
<ul>
<li>
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>&nbsp;&nbsp;___________________&nbsp;&nbsp;<em>javadoc-contrib-xml-query-parser</em>

View File

@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
<a href="../api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="../api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="../api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>
@ -263,9 +260,6 @@ document.write("Last Published: " + document.lastModified);
<a href="#spellchecker">spellchecker</a>
</li>
<li>
<a href="#wordnet">wordnet</a>
</li>
<li>
<a href="#xml-query-parser">xml-query-parser</a>
</li>
</ul>
@ -375,12 +369,7 @@ document.write("Last Published: " + document.lastModified);
<p>Provides tools for spellchecking and suggestions with Lucene.</p>
<p>See <a href="../api/contrib-spellchecker/index.html">spellchecker javadoc</a>
</p>
<a name="N100DE"></a><a name="wordnet"></a>
<h3 class="boxed">wordnet</h3>
<p>Tools to help utilize wordnet synonyms with Lucene</p>
<p>See <a href="../api/contrib-wordnet/index.html">wordnet javadoc</a>
</p>
<a name="N100ED"></a><a name="xml-query-parser"></a>
<a name="N100DE"></a><a name="xml-query-parser"></a>
<h3 class="boxed">xml-query-parser</h3>
<p>A QueryParser that can read queries written in an XML format.</p>
<p>See <a href="../api/contrib-wordnet/index.html">xml-query-parser javadoc</a>

View File

@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>

View File

@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>

View File

@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
</div>
<div class="menuitem">
<a href="api/contrib-wordnet/index.html">Wordnet</a>
</div>
<div class="menuitem">
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
</div>
</div>

View File

@ -106,11 +106,6 @@
<p>See <a href="../api/contrib-spellchecker/index.html">spellchecker javadoc</a></p>
</section>
<section id="wordnet"><title>wordnet</title>
<p>Tools to help utilize wordnet synonyms with Lucene</p>
<p>See <a href="../api/contrib-wordnet/index.html">wordnet javadoc</a></p>
</section>
<section id="xml-query-parser"><title>xml-query-parser</title>
<p>A QueryParser that can read queries written in an XML format.</p>
<p>See <a href="../api/contrib-wordnet/index.html">xml-query-parser javadoc</a></p>

View File

@ -66,7 +66,6 @@ See http://forrest.apache.org/docs/linking.html for more info
<javadoc-contrib-remote label="Remote" href="ext:javadocs-contrib-remote"/>
<javadoc-contrib-spatial label="Spatial" href="ext:javadocs-contrib-spatial"/>
<javadoc-contrib-spellchecker label="Spellchecker" href="ext:javadocs-contrib-spellchecker"/>
<javadoc-contrib-wordnet label="Wordnet" href="ext:javadocs-contrib-wordnet"/>
<javadoc-contrib-xml-query-parser label="XML Query Parser" href="ext:javadocs-contrib-xml-query-parser"/>
</javadoc-contrib>
</javadoc>
@ -106,7 +105,6 @@ See http://forrest.apache.org/docs/linking.html for more info
<javadocs-contrib-remote href="api/contrib-remote/index.html"/>
<javadocs-contrib-spatial href="api/contrib-spatial/index.html"/>
<javadocs-contrib-spellchecker href="api/contrib-spellchecker/index.html"/>
<javadocs-contrib-wordnet href="api/contrib-wordnet/index.html"/>
<javadocs-contrib-xml-query-parser href="api/contrib-xml-query-parser/index.html"/>
<forrest href="http://forrest.apache.org/">

View File

@ -261,6 +261,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
text = _TestUtil.randomUnicodeString(random, maxWordLength);
}
if (VERBOSE) {
System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
}
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text));
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
@ -286,6 +290,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
ts.close();
// verify reusing is "reproducible" and also get the normal tokenstream sanity checks
if (!tokens.isEmpty()) {
if (VERBOSE) {
System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
}
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertAnalyzesToReuse(a, text,

View File

@ -31,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
@ -166,6 +167,13 @@ public class TestIndexWriterCommit extends LuceneTestCase {
* measure max temp disk space used.
*/
public void testCommitOnCloseDiskUsage() throws IOException {
// MemoryCodec, since it uses FST, is not necessarily
// "additive", ie if you add up N small FSTs, then merge
// them, the merged result can easily be larger than the
// sum because the merged FST may use array encoding for
// some arcs (which uses more space):
assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory"));
assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory"));
MockDirectoryWrapper dir = newDirectory();
Analyzer analyzer;
if (random.nextBoolean()) {

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
@ -142,6 +143,14 @@ public class TestIndexWriterOnDiskFull extends LuceneTestCase {
*/
public void testAddIndexOnDiskFull() throws IOException
{
// MemoryCodec, since it uses FST, is not necessarily
// "additive", ie if you add up N small FSTs, then merge
// them, the merged result can easily be larger than the
// sum because the merged FST may use array encoding for
// some arcs (which uses more space):
assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory"));
assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory"));
int START_COUNT = 57;
int NUM_DIR = TEST_NIGHTLY ? 50 : 5;
int END_COUNT = START_COUNT + NUM_DIR* (TEST_NIGHTLY ? 25 : 5);

View File

@ -0,0 +1,41 @@
package org.apache.lucene.util;
import java.util.Arrays;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestCharsRef extends LuceneTestCase {
public void testUTF16InUTF8Order() {
final int numStrings = atLeast(1000);
BytesRef utf8[] = new BytesRef[numStrings];
CharsRef utf16[] = new CharsRef[numStrings];
for (int i = 0; i < numStrings; i++) {
String s = _TestUtil.randomUnicodeString(random);
utf8[i] = new BytesRef(s);
utf16[i] = new CharsRef(s);
}
Arrays.sort(utf8);
Arrays.sort(utf16, CharsRef.getUTF16SortedAsUTF8Comparator());
for (int i = 0; i < numStrings; i++) {
assertEquals(utf8[i].utf8ToString(), utf16[i].toString());
}
}
}

View File

@ -0,0 +1,179 @@
package org.apache.lucene.analysis.synonym;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
/**
* Parser for the Solr synonyms format.
* <ol>
* <li> Blank lines and lines starting with '#' are comments.
* <li> Explicit mappings match any token sequence on the LHS of "=>"
* and replace with all alternatives on the RHS. These types of mappings
* ignore the expand parameter in the constructor.
* Example:
* <blockquote>i-pod, i pod => ipod</blockquote>
* <li> Equivalent synonyms may be separated with commas and give
* no explicit mapping. In this case the mapping behavior will
* be taken from the expand parameter in the constructor. This allows
* the same synonym file to be used in different synonym handling strategies.
* Example:
* <blockquote>ipod, i-pod, i pod</blockquote>
*
* <li> Multiple synonym mapping entries are merged.
* Example:
* <blockquote>
* foo => foo bar<br>
* foo => baz<br><br>
* is equivalent to<br><br>
* foo => foo bar, baz
* </blockquote>
* </ol>
* @lucene.experimental
*/
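// Illustrative usage sketch (not part of this patch): load rules in the format
// described above and build a SynonymMap; "analyzer" and "tokens" are assumed to
// exist (the same Analyzer you use at index/query time, and your input TokenStream),
// and java.io.StringReader is assumed imported.
//
//   SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);   // dedup=true, expand=true
//   parser.add(new StringReader("i-pod, i pod => ipod\nfoo => foo bar, baz\n"));
//   SynonymMap map = parser.build();
//   TokenStream syns = new SynonymFilter(tokens, map, false);                 // then wrap your stream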
public class SolrSynonymParser extends SynonymMap.Builder {
private final boolean expand;
private final Analyzer analyzer;
public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
super(dedup);
this.expand = expand;
this.analyzer = analyzer;
}
public void add(Reader in) throws IOException, ParseException {
LineNumberReader br = new LineNumberReader(in);
try {
addInternal(br);
} catch (IllegalArgumentException e) {
ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
ex.initCause(e);
throw ex;
} finally {
br.close();
}
}
private void addInternal(BufferedReader in) throws IOException {
String line = null;
while ((line = in.readLine()) != null) {
if (line.length() == 0 || line.charAt(0) == '#') {
continue; // ignore empty lines and comments
}
CharsRef inputs[];
CharsRef outputs[];
// TODO: we could process this more efficiently.
String sides[] = split(line, "=>");
if (sides.length > 1) { // explicit mapping
if (sides.length != 2) {
throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
}
String inputStrings[] = split(sides[0], ",");
inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
}
String outputStrings[] = split(sides[1], ",");
outputs = new CharsRef[outputStrings.length];
for (int i = 0; i < outputs.length; i++) {
outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
}
} else {
String inputStrings[] = split(line, ",");
inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
}
if (expand) {
outputs = inputs;
} else {
outputs = new CharsRef[1];
outputs[0] = inputs[0];
}
}
// currently we include the term itself in the map,
// and use includeOrig = false always.
// this is how the existing filter does it, but it's actually a bug,
// especially if combined with ignoreCase = true
for (int i = 0; i < inputs.length; i++) {
for (int j = 0; j < outputs.length; j++) {
add(inputs[i], outputs[j], false);
}
}
}
}
private static String[] split(String s, String separator) {
ArrayList<String> list = new ArrayList<String>(2);
StringBuilder sb = new StringBuilder();
int pos=0, end=s.length();
while (pos < end) {
if (s.startsWith(separator,pos)) {
if (sb.length() > 0) {
list.add(sb.toString());
sb=new StringBuilder();
}
pos+=separator.length();
continue;
}
char ch = s.charAt(pos++);
if (ch=='\\') {
sb.append(ch);
if (pos>=end) break; // ERROR, or let it go?
ch = s.charAt(pos++);
}
sb.append(ch);
}
if (sb.length() > 0) {
list.add(sb.toString());
}
return list.toArray(new String[list.size()]);
}
private String unescape(String s) {
if (s.indexOf("\\") >= 0) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
if (ch == '\\' && i < s.length() - 1) {
sb.append(s.charAt(++i));
} else {
sb.append(ch);
}
}
return sb.toString();
}
return s;
}
}

View File

@ -1,3 +1,5 @@
package org.apache.lucene.analysis.synonym;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -15,245 +17,550 @@
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
* <p>
* The matched tokens from the input stream may be optionally passed through (includeOrig=true)
* or discarded. If the original tokens are included, the position increments may be modified
* to retain absolute positions after merging with the synonym tokenstream.
* <p>
* Generated synonyms will start at the same position as the first matched source token.
/**
* Matches single or multi word synonyms in a token stream.
* This token stream cannot properly handle position
* increments != 1, ie, you should place this filter before
* filtering out stop words.
*
* <p>Note that with the current implementation, parsing is
* greedy, so whenever multiple parses would apply, the rule
* starting the earliest and parsing the most tokens wins.
* For example if you have these rules:
*
* <pre>
* a -> x
* a b -> y
* b c d -> z
* </pre>
*
* Then input <code>a b c d e</code> parses to <code>y b c
* d</code>, ie the 2nd rule "wins" because it started
* earliest and matched more input tokens than the other rules
* starting at that point.</p>
*
* <p>A future improvement to this filter could allow
* non-greedy parsing, such that the 3rd rule would win, and
* also separately allow multiple parses, such that all 3
* rules would match, perhaps even on a rule by rule
* basis.</p>
*
* <p><b>NOTE</b>: when a match occurs, the output tokens
* associated with the matching rule are "stacked" on top of
* the input stream (if the rule had
* <code>keepOrig=true</code>) and also on top of another
* matched rule's output tokens. This is not a correct
* solution, as really the output should be an arbitrary
* graph/lattice. For example, with the above match, you
* would expect an exact <code>PhraseQuery</code> <code>"y b
* c"</code> to match the parsed tokens, but it will fail to
* do so. This limitation is necessary because Lucene's
* TokenStream (and index) cannot yet represent an arbitrary
* graph.</p>
*
* <p><b>NOTE</b>: If multiple incoming tokens arrive on the
* same position, only the first token at that position is
* used for parsing. Subsequent tokens simply pass through
* and are not parsed. A future improvement would be to
* allow these tokens to also be matched.</p>
*/
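// Illustrative usage sketch (not part of this patch): builds the three example
// rules above with SynonymMap.Builder and wraps an existing TokenStream named
// "tokens"; the usual org.apache.lucene.analysis and org.apache.lucene.util
// imports are assumed.
//
//   SynonymMap.Builder builder = new SynonymMap.Builder(true);        // dedup identical rules
//   builder.add(new CharsRef("a"), new CharsRef("x"), true);          // a -> x
//   builder.add(SynonymMap.Builder.join(new String[] {"a", "b"}, new CharsRef()),
//               new CharsRef("y"), true);                             // a b -> y
//   builder.add(SynonymMap.Builder.join(new String[] {"b", "c", "d"}, new CharsRef()),
//               new CharsRef("z"), true);                             // b c d -> z
//   SynonymMap map = builder.build();
//   TokenStream syns = new SynonymFilter(tokens, map, true);          // ignoreCase=true
//   // feeding "a b c d e" through syns then exercises the greedy matching described above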
// TODO: maybe we should resolve token -> wordID then run
// FST on wordIDs, for better perf?
// TODO: a more efficient approach would be Aho/Corasick's
// algorithm
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
// It improves over the current approach here
// because it does not fully re-start matching at every
// token. For example, if one pattern is "a b c x"
// and another is "b c d" and the input is "a b c d", on
// trying to parse "a b c x" but failing when you got to x,
// rather than starting over again you really should
// immediately recognize that "b c d" matches at the next
// input. I suspect this won't matter that much in
// practice, but it's possible on some set of synonyms it
// will. We'd have to modify Aho/Corasick to enforce our
// conflict resolving (eg greedy matching) because that algo
// finds all matches.
public final class SynonymFilter extends TokenFilter {
private final SynonymMap map; // Map<String, SynonymMap>
private Iterator<AttributeSource> replacement; // iterator over generated tokens
public static final String TYPE_SYNONYM = "SYNONYM";
public SynonymFilter(TokenStream in, SynonymMap map) {
super(in);
if (map == null)
throw new IllegalArgumentException("map is required");
private final SynonymMap synonyms;
this.map = map;
// just ensuring these attributes exist...
addAttribute(CharTermAttribute.class);
addAttribute(PositionIncrementAttribute.class);
addAttribute(OffsetAttribute.class);
addAttribute(TypeAttribute.class);
private final boolean ignoreCase;
private final int rollBufferSize;
private int captureCount;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
// How many future input tokens have already been matched
// to a synonym; because the matching is "greedy" we don't
// try to do any more matching for such tokens:
private int inputSkipCount;
// Hold all buffered (read ahead) stacked input tokens for
// a future position. When multiple tokens are at the
// same position, we only store (and match against) the
// term for the first token at the position, but capture
// state for (and enumerate) all other tokens at this
// position:
private static class PendingInput {
final CharsRef term = new CharsRef();
AttributeSource.State state;
boolean keepOrig;
boolean consumed = true;
int startOffset;
int endOffset;
public void reset() {
state = null;
consumed = true;
keepOrig = false;
}
};
// Rolling buffer, holding pending input tokens we had to
// clone because we needed to look ahead, indexed by
// position:
private final PendingInput[] futureInputs;
// Holds pending output synonyms for one future position:
private static class PendingOutputs {
CharsRef[] outputs;
int upto;
int count;
int posIncr = 1;
public PendingOutputs() {
outputs = new CharsRef[1];
}
/*
* Need to worry about multiple scenarios:
* - need to go for the longest match
* a b => foo #shouldn't match if "a b" is followed by "c d"
* a b c d => bar
* - need to backtrack - retry matches for tokens already read
* a b c d => foo
* b c => bar
* If the input stream is "a b c x", one will consume "a b c d"
* trying to match the first rule... all but "a" should be
* pushed back so a match may be made on "b c".
* - don't try and match generated tokens (thus need separate queue)
* matching is not recursive.
* - handle optional generation of original tokens in all these cases,
* merging token streams to preserve token positions.
* - preserve original positionIncrement of first matched token
*/
@Override
public boolean incrementToken() throws IOException {
while (true) {
// if there are any generated tokens, return them... don't try any
// matches against them, as we specifically don't want recursion.
if (replacement!=null && replacement.hasNext()) {
copy(this, replacement.next());
return true;
public void reset() {
upto = count = 0;
posIncr = 1;
}
// common case fast-path of first token not matching anything
AttributeSource firstTok = nextTok();
if (firstTok == null) return false;
CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
SynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
if (result == null) {
copy(this, firstTok);
return true;
public CharsRef pullNext() {
assert upto < count;
final CharsRef result = outputs[upto++];
posIncr = 0;
if (upto == count) {
reset();
}
// fast-path failed, clone ourselves if needed
if (firstTok == this)
firstTok = cloneAttributes();
// OK, we matched a token, so find the longest match.
matched = new LinkedList<AttributeSource>();
result = match(result);
if (result==null) {
// no match, simply return the first token read.
copy(this, firstTok);
return true;
}
// reuse, or create new one each time?
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
//
// there was a match... let's generate the new tokens, merging
// in the matched tokens (position increments need adjusting)
//
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
boolean includeOrig = result.includeOrig();
AttributeSource origTok = includeOrig ? firstTok : null;
PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
int repPos=0; // curr position in replacement token stream
int pos=0; // current position in merged token stream
for (int i=0; i<result.synonyms.length; i++) {
Token repTok = result.synonyms[i];
AttributeSource newTok = firstTok.cloneAttributes();
CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
repPos += repTok.getPositionIncrement();
if (i==0) repPos=origPos; // make position of first token equal to original
// if necessary, insert original tokens and adjust position increment
while (origTok != null && origPos <= repPos) {
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
newPosIncAtt.setPositionIncrement(repPos - pos);
generated.add(newTok);
pos += newPosIncAtt.getPositionIncrement();
}
// finish up any leftover original tokens
while (origTok!=null) {
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
// what if we replaced a longer sequence with a shorter one?
// a/0 b/5 => foo/0
// should I re-create the gap on the next buffered token?
replacement = generated.iterator();
// Now return to the top of the loop to read and return the first
// generated token.. The reason this is done is that we may have generated
// nothing at all, and may need to continue with more matching logic.
}
}
//
// Defer creation of the buffer until the first time it is used to
// optimize short fields with no matches.
//
private LinkedList<AttributeSource> buffer;
private LinkedList<AttributeSource> matched;
private boolean exhausted;
private AttributeSource nextTok() throws IOException {
if (buffer!=null && !buffer.isEmpty()) {
return buffer.removeFirst();
} else {
if (!exhausted && input.incrementToken()) {
return this;
} else {
exhausted = true;
return null;
}
}
}
private void pushTok(AttributeSource t) {
if (buffer==null) buffer=new LinkedList<AttributeSource>();
buffer.addFirst(t);
}
private SynonymMap match(SynonymMap map) throws IOException {
SynonymMap result = null;
if (map.submap != null) {
AttributeSource tok = nextTok();
if (tok != null) {
// clone ourselves.
if (tok == this)
tok = cloneAttributes();
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
SynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
if (subMap != null) {
// recurse
result = match(subMap);
}
if (result != null) {
matched.addFirst(tok);
} else {
// push back unmatched token
pushTok(tok);
}
}
}
// if no longer sequence matched, so if this node has synonyms, it's the match.
if (result==null && map.synonyms!=null) {
result = map;
}
return result;
}
private void copy(AttributeSource target, AttributeSource source) {
if (target != source)
source.copyTo(target);
public void add(char[] output, int offset, int len) {
if (count == outputs.length) {
final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(outputs, 0, next, 0, count);
outputs = next;
}
if (outputs[count] == null) {
outputs[count] = new CharsRef();
}
outputs[count].copy(output, offset, len);
count++;
}
};
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
// Rolling buffer, holding stack of pending synonym
// outputs, indexed by position:
private final PendingOutputs[] futureOutputs;
// Where (in rolling buffers) to write next input saved state:
private int nextWrite;
// Where (in rolling buffers) to read next input saved state:
private int nextRead;
// True once we've read last token
private boolean finished;
private final FST.Arc<BytesRef> scratchArc;
private final FST<BytesRef> fst;
private final BytesRef scratchBytes = new BytesRef();
private final CharsRef scratchChars = new CharsRef();
/**
* @param input input tokenstream
* @param synonyms synonym map
* @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
* Note, if you set this to true, its your responsibility to lowercase
* the input entries when you create the {@link SynonymMap}
*/
public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
super(input);
this.synonyms = synonyms;
this.ignoreCase = ignoreCase;
this.fst = synonyms.fst;
if (fst == null) {
throw new IllegalArgumentException("fst must be non-null");
}
// Must be 1+ so that when roll buffer is at full
// lookahead we can distinguish this full buffer from
// the empty buffer:
rollBufferSize = 1+synonyms.maxHorizontalContext;
futureInputs = new PendingInput[rollBufferSize];
futureOutputs = new PendingOutputs[rollBufferSize];
for(int pos=0;pos<rollBufferSize;pos++) {
futureInputs[pos] = new PendingInput();
futureOutputs[pos] = new PendingOutputs();
}
//System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);
scratchArc = new FST.Arc<BytesRef>();
}
private void capture() {
captureCount++;
//System.out.println(" capture slot=" + nextWrite);
final PendingInput input = futureInputs[nextWrite];
input.state = captureState();
input.consumed = false;
input.term.copy(termAtt.buffer(), 0, termAtt.length());
nextWrite = rollIncr(nextWrite);
// Buffer head should never catch up to tail:
assert nextWrite != nextRead;
}
/*
This is the core of this TokenFilter: it locates the
synonym matches and buffers up the results into
futureInputs/Outputs.
NOTE: this calls input.incrementToken and does not
capture the state if no further tokens were checked. So
caller must then forward state to our caller, or capture:
*/
private void parse() throws IOException {
//System.out.println("\nS: parse");
assert inputSkipCount == 0;
int curNextRead = nextRead;
// Holds the longest match we've seen so far:
BytesRef matchOutput = null;
int matchInputLength = 0;
BytesRef pendingOutput = fst.outputs.getNoOutput();
fst.getFirstArc(scratchArc);
assert scratchArc.output == fst.outputs.getNoOutput();
int tokenCount = 0;
byToken:
while(true) {
// Pull next token's chars:
final char[] buffer;
final int bufferLen;
//System.out.println(" cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
if (curNextRead == nextWrite) {
// We used up our lookahead buffer of input tokens
// -- pull next real input token:
if (finished) {
break;
} else {
//System.out.println(" input.incrToken");
assert futureInputs[nextWrite].consumed;
// Not correct: a syn match whose output is longer
// than its input can set future inputs keepOrig
// to true:
//assert !futureInputs[nextWrite].keepOrig;
if (input.incrementToken()) {
buffer = termAtt.buffer();
bufferLen = termAtt.length();
final PendingInput input = futureInputs[nextWrite];
input.startOffset = offsetAtt.startOffset();
input.endOffset = offsetAtt.endOffset();
//System.out.println(" new token=" + new String(buffer, 0, bufferLen));
if (nextRead != nextWrite) {
capture();
} else {
input.consumed = false;
}
} else {
// No more input tokens
//System.out.println(" set end");
finished = true;
break;
}
}
} else {
// Still in our lookahead
buffer = futureInputs[curNextRead].term.chars;
bufferLen = futureInputs[curNextRead].term.length;
//System.out.println(" old token=" + new String(buffer, 0, bufferLen));
}
tokenCount++;
// Run each char in this token through the FST:
int bufUpto = 0;
while(bufUpto < bufferLen) {
final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
//System.out.println(" stop");
break byToken;
}
// Accum the output
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
//System.out.println(" char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
bufUpto += Character.charCount(codePoint);
}
// OK, entire token matched; now see if this is a final
// state:
if (scratchArc.isFinal()) {
matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
matchInputLength = tokenCount;
//System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput);
}
// See if the FST wants to continue matching (ie, needs to
// see the next input token):
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
// No further rules can match here; we're done
// searching for matching rules starting at the
// current input position.
break;
} else {
// More matching is possible -- accum the output (if
// any) of the WORD_SEP arc:
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
if (nextRead == nextWrite) {
capture();
}
}
curNextRead = rollIncr(curNextRead);
}
if (nextRead == nextWrite && !finished) {
//System.out.println(" skip write slot=" + nextWrite);
nextWrite = rollIncr(nextWrite);
}
if (matchOutput != null) {
//System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
inputSkipCount = matchInputLength;
addOutput(matchOutput);
} else if (nextRead != nextWrite) {
// Even though we had no match here, we set to 1
// because we need to skip current input token before
// trying to match again:
inputSkipCount = 1;
} else {
assert finished;
}
//System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
}
// Interleaves all output tokens onto the futureOutputs:
private void addOutput(BytesRef bytes) {
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
final int code = bytesReader.readVInt();
final boolean keepOrig = (code & 0x1) == 0;
final int count = code >>> 1;
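// e.g. code==6 (binary 110) decodes to keepOrig=true (low bit clear) and count=3 output words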
//System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig);
for(int outputIDX=0;outputIDX<count;outputIDX++) {
synonyms.words.get(bytesReader.readVInt(),
scratchBytes);
//System.out.println(" outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
int lastStart = scratchChars.offset;
final int chEnd = lastStart + scratchChars.length;
int outputUpto = nextRead;
for(int chIDX=lastStart;chIDX<=chEnd;chIDX++) {
if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) {
final int outputLen = chIDX - lastStart;
// Caller is not allowed to have empty string in
// the output:
assert outputLen > 0: "output contains empty string: " + scratchChars;
futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
//System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
lastStart = 1+chIDX;
futureInputs[outputUpto].keepOrig |= keepOrig;
//System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
outputUpto = rollIncr(outputUpto);
assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
}
}
}
}
// ++ mod rollBufferSize
private int rollIncr(int count) {
count++;
if (count == rollBufferSize) {
return 0;
} else {
return count;
}
}
// for testing
int getCaptureCount() {
return captureCount;
}
@Override
public boolean incrementToken() throws IOException {
//System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
while(true) {
// First play back any buffered future inputs/outputs
// w/o running parsing again:
while (inputSkipCount != 0) {
// At each position, we first output the original
// token
// TODO: maybe just a PendingState class, holding
// both input & outputs?
final PendingInput input = futureInputs[nextRead];
final PendingOutputs outputs = futureOutputs[nextRead];
//System.out.println(" cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
if (!input.consumed && (input.keepOrig || outputs.count == 0)) {
if (input.state != null) {
// Return a previously saved token (because we
// had to lookahead):
restoreState(input.state);
} else {
// Pass-through case: return token we just pulled
// but didn't capture:
assert inputSkipCount == 1: "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
}
input.reset();
if (outputs.count > 0) {
outputs.posIncr = 0;
} else {
nextRead = rollIncr(nextRead);
inputSkipCount--;
}
//System.out.println(" return token=" + termAtt.toString());
return true;
} else if (outputs.upto < outputs.count) {
// Still have pending outputs to replay at this
// position
input.reset();
final int posIncr = outputs.posIncr;
final CharsRef output = outputs.pullNext();
clearAttributes();
termAtt.copyBuffer(output.chars, output.offset, output.length);
typeAtt.setType(TYPE_SYNONYM);
offsetAtt.setOffset(input.startOffset, input.endOffset);
posIncrAtt.setPositionIncrement(posIncr);
if (outputs.count == 0) {
// Done with the buffered input and all outputs at
// this position
nextRead = rollIncr(nextRead);
inputSkipCount--;
}
//System.out.println(" return token=" + termAtt.toString());
return true;
} else {
// Done with the buffered input and all outputs at
// this position
input.reset();
nextRead = rollIncr(nextRead);
inputSkipCount--;
}
}
if (finished && nextRead == nextWrite) {
// End case: if any output syns went beyond end of
// input stream, enumerate them now:
final PendingOutputs outputs = futureOutputs[nextRead];
if (outputs.upto < outputs.count) {
final int posIncr = outputs.posIncr;
final CharsRef output = outputs.pullNext();
futureInputs[nextRead].reset();
if (outputs.count == 0) {
nextWrite = nextRead = rollIncr(nextRead);
}
clearAttributes();
termAtt.copyBuffer(output.chars, output.offset, output.length);
typeAtt.setType(TYPE_SYNONYM);
//System.out.println(" set posIncr=" + outputs.posIncr + " outputs=" + outputs);
posIncrAtt.setPositionIncrement(posIncr);
//System.out.println(" return token=" + termAtt.toString());
return true;
} else {
return false;
}
}
// Find new synonym matches:
parse();
}
}
@Override
public void reset() throws IOException {
super.reset();
captureCount = 0;
finished = false;
// In normal usage these resets would not be needed,
// since they reset-as-they-are-consumed, but the app
// may not consume all input tokens in which case we
// have leftover state here:
for (PendingInput input : futureInputs) {
input.reset();
replacement = null;
exhausted = false;
}
for (PendingOutputs output : futureOutputs) {
output.reset();
}
}
}

View File

@ -1,3 +1,5 @@
package org.apache.lucene.analysis.synonym;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -15,146 +17,301 @@
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import java.util.*;
/** Mapping rules for use with {@link SynonymFilter}
/**
* A map of synonyms, keys and values are phrases.
* @lucene.experimental
*/
public class SynonymMap {
/** @lucene.internal */
public CharArrayMap<SynonymMap> submap; // recursive: Map<String, SynonymMap>
/** @lucene.internal */
public Token[] synonyms;
int flags;
/** for multiword support, you must separate words with this separator */
public static final char WORD_SEPARATOR = 0;
/** map<input word, list<ord>> */
public final FST<BytesRef> fst;
/** map<ord, outputword> */
public final BytesRefHash words;
/** maxHorizontalContext: maximum context we need on the tokenstream */
public final int maxHorizontalContext;
static final int INCLUDE_ORIG=0x01;
static final int IGNORE_CASE=0x02;
public SynonymMap() {}
public SynonymMap(boolean ignoreCase) {
if (ignoreCase) flags |= IGNORE_CASE;
public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext) {
this.fst = fst;
this.words = words;
this.maxHorizontalContext = maxHorizontalContext;
}
public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; }
public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; }
/**
* @param singleMatch List<String>, the sequence of strings to match
* @param replacement List<Token> the list of tokens to use on a match
* @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
* @param mergeExisting merge the replacement tokens with any other mappings that exist
* Builds a SynonymMap.
* <p>
* Call add() until you have added all the mappings, then call build() to get a SynonymMap
* @lucene.experimental
*/
public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
SynonymMap currMap = this;
for (String str : singleMatch) {
if (currMap.submap==null) {
// for now hardcode at 4.0, as its what the old code did.
// would be nice to fix, but shouldn't store a version in each submap!!!
currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_40, 1, ignoreCase());
public static class Builder {
private final HashMap<CharsRef,MapEntry> workingSet = new HashMap<CharsRef,MapEntry>();
private final BytesRefHash words = new BytesRefHash();
private final BytesRef utf8Scratch = new BytesRef(8);
private int maxHorizontalContext;
private final boolean dedup;
/** If dedup is true then identical rules (same input,
* same output) will be added only once. */
public Builder(boolean dedup) {
this.dedup = dedup;
}
SynonymMap map = currMap.submap.get(str);
if (map==null) {
map = new SynonymMap();
map.flags |= flags & IGNORE_CASE;
currMap.submap.put(str, map);
private static class MapEntry {
boolean includeOrig;
// we could sort for better sharing ultimately, but it could confuse people
ArrayList<Integer> ords = new ArrayList<Integer>();
}
currMap = map;
/** Sugar: just joins the provided terms with {@link
* SynonymMap#WORD_SEPARATOR}. reuse and its chars
* must not be null. */
public static CharsRef join(String[] words, CharsRef reuse) {
int upto = 0;
char[] buffer = reuse.chars;
for(String word : words) {
if (upto > 0) {
if (upto >= buffer.length) {
reuse.grow(upto);
buffer = reuse.chars;
}
buffer[upto++] = SynonymMap.WORD_SEPARATOR;
}
if (currMap.synonyms != null && !mergeExisting) {
throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
}
List<Token> superset = currMap.synonyms==null ? replacement :
mergeTokens(Arrays.asList(currMap.synonyms), replacement);
currMap.synonyms = superset.toArray(new Token[superset.size()]);
if (includeOrig) currMap.flags |= INCLUDE_ORIG;
final int wordLen = word.length();
final int needed = upto + wordLen;
if (needed > buffer.length) {
reuse.grow(needed);
buffer = reuse.chars;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("<");
if (synonyms!=null) {
sb.append("[");
for (int i=0; i<synonyms.length; i++) {
if (i!=0) sb.append(',');
sb.append(synonyms[i]);
}
if ((flags & INCLUDE_ORIG)!=0) {
sb.append(",ORIG");
}
sb.append("],");
}
sb.append(submap);
sb.append(">");
return sb.toString();
word.getChars(0, wordLen, buffer, upto);
upto += wordLen;
}
/** Produces a List<Token> from a List<String> */
public static List<Token> makeTokens(List<String> strings) {
List<Token> ret = new ArrayList<Token>(strings.size());
for (String str : strings) {
//Token newTok = new Token(str,0,0,"SYNONYM");
Token newTok = new Token(str, 0,0,"SYNONYM");
ret.add(newTok);
}
return ret;
return reuse;
}
/** Sugar: analyzes the text with the analyzer and
* separates by {@link SynonymMap#WORD_SEPARATOR}.
* reuse and its chars must not be null. */
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
reuse.length = 0;
while (ts.incrementToken()) {
int length = termAtt.length();
if (length == 0) {
throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
}
if (posIncAtt.getPositionIncrement() != 1) {
throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
}
reuse.grow(reuse.length + length + 1); /* current + word + separator */
int end = reuse.offset + reuse.length;
if (reuse.length > 0) {
reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
reuse.length++;
}
System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
reuse.length += length;
}
ts.end();
ts.close();
if (reuse.length == 0) {
throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
}
return reuse;
}
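// Illustrative sketch (not part of this patch): both helpers above produce the
// WORD_SEPARATOR-delimited phrase form that add() expects; "analyzer" and
// "builder" are assumed to exist.
//
//   CharsRef input  = SynonymMap.Builder.join(new String[] {"wi", "fi"}, new CharsRef());
//   CharsRef output = SynonymMap.Builder.analyze(analyzer, "wireless network", new CharsRef());
//   builder.add(input, output, true);   // "wi fi" -> "wireless network", keeping the original tokens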
/** only used for asserting! */
private boolean hasHoles(CharsRef chars) {
final int end = chars.offset + chars.length;
for(int idx=chars.offset+1;idx<end;idx++) {
if (chars.chars[idx] == SynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == SynonymMap.WORD_SEPARATOR) {
return true;
}
}
if (chars.chars[chars.offset] == '\u0000') {
return true;
}
if (chars.chars[chars.offset + chars.length - 1] == '\u0000') {
return true;
}
return false;
}
// NOTE: while it's tempting to make this public, since
// the caller's parser likely knows the
// numInput/numOutputWords, sneaky exceptions much later
// on will result if these values are wrong; so we always
// recompute them ourselves to be safe:
private void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) {
// first convert to UTF-8
if (numInputWords <= 0) {
throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
}
if (input.length <= 0) {
throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
}
if (numOutputWords <= 0) {
throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
}
if (output.length <= 0) {
throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
}
assert !hasHoles(input): "input has holes: " + input;
assert !hasHoles(output): "output has holes: " + output;
//System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch);
// lookup in hash
int ord = words.add(utf8Scratch, hashCode);
if (ord < 0) {
// already exists in our hash
ord = (-ord)-1;
//System.out.println(" output=" + output + " old ord=" + ord);
} else {
//System.out.println(" output=" + output + " new ord=" + ord);
}
MapEntry e = workingSet.get(input);
if (e == null) {
e = new MapEntry();
workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map
}
e.ords.add(ord);
e.includeOrig |= includeOrig;
maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
}
private int countWords(CharsRef chars) {
int wordCount = 1;
int upto = chars.offset;
final int limit = chars.offset + chars.length;
while(upto < limit) {
if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) {
wordCount++;
}
}
return wordCount;
}
/**
* Add a phrase->phrase synonym mapping.
* Phrases are character sequences where words are
* separated with character zero (\u0000). Empty words
* (two \u0000s in a row) are not allowed in the input nor
* the output!
*
* @param input input phrase
* @param output output phrase
* @param includeOrig true if the original should be included
*/
public void add(CharsRef input, CharsRef output, boolean includeOrig) {
add(input, countWords(input), output, countWords(output), includeOrig);
}
/**
* Builds an {@link SynonymMap} and returns it.
*/
public SynonymMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options?
org.apache.lucene.util.fst.Builder<BytesRef> builder =
new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
BytesRef scratch = new BytesRef(64);
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
final Set<Integer> dedupSet;
if (dedup) {
dedupSet = new HashSet<Integer>();
} else {
dedupSet = null;
}
final byte[] spare = new byte[5];
Set<CharsRef> keys = workingSet.keySet();
CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
//System.out.println("fmap.build");
for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
CharsRef input = sortedKeys[keyIdx];
MapEntry output = workingSet.get(input);
int numEntries = output.ords.size();
// output size, assume the worst case
int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
scratch.grow(estimatedSize);
scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
assert scratch.offset == 0;
// now write our output data:
int count = 0;
for (int i = 0; i < numEntries; i++) {
if (dedupSet != null) {
// box once
final Integer ent = output.ords.get(i);
if (dedupSet.contains(ent)) {
continue;
}
dedupSet.add(ent);
}
scratchOutput.writeVInt(output.ords.get(i));
count++;
}
final int pos = scratchOutput.getPosition();
scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
final int pos2 = scratchOutput.getPosition();
final int vIntLen = pos2-pos;
// Move the count + includeOrig to the front of the byte[]:
System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);
if (dedupSet != null) {
dedupSet.clear();
}
scratch.length = scratchOutput.getPosition() - scratch.offset;
//System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
builder.add(input, new BytesRef(scratch));
}
FST<BytesRef> fst = builder.finish();
return new SynonymMap(fst, words, maxHorizontalContext);
}
}
}
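For reference, a minimal usage sketch of the Builder API above (not part of this patch; the class name, rule text, and tokenizer choice are illustrative assumptions): it registers one multi-word rule using \u0000 as the word separator and wraps a whitespace token stream with the resulting SynonymFilter.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

public class SynonymMapUsageSketch {
  public static TokenStream wrap() throws Exception {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);        // dedup=true
    // phrases use \u0000 (SynonymMap.WORD_SEPARATOR) between words
    builder.add(new CharsRef("dns"),
                new CharsRef("domain\u0000name\u0000service"), true); // includeOrig=true
    SynonymMap map = builder.build();
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_31,
                                             new StringReader("dns lookup"));
    return new SynonymFilter(ts, map, true);                          // ignoreCase=true
  }
}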

View File

@ -0,0 +1,112 @@
package org.apache.lucene.analysis.synonym;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
/**
* Parser for wordnet prolog format
* <p>
* See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
* @lucene.experimental
*/
// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
public class WordnetSynonymParser extends SynonymMap.Builder {
private final boolean expand;
private final Analyzer analyzer;
public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
super(dedup);
this.expand = expand;
this.analyzer = analyzer;
}
public void add(Reader in) throws IOException, ParseException {
LineNumberReader br = new LineNumberReader(in);
try {
String line = null;
String lastSynSetID = "";
CharsRef synset[] = new CharsRef[8];
int synsetSize = 0;
while ((line = br.readLine()) != null) {
String synSetID = line.substring(2, 11);
if (!synSetID.equals(lastSynSetID)) {
addInternal(synset, synsetSize);
synsetSize = 0;
}
if (synset.length <= synsetSize+1) {
CharsRef larger[] = new CharsRef[synset.length * 2];
System.arraycopy(synset, 0, larger, 0, synsetSize);
synset = larger;
}
synset[synsetSize] = parseSynonym(line, synset[synsetSize]);
synsetSize++;
lastSynSetID = synSetID;
}
// final synset in the file
addInternal(synset, synsetSize);
} catch (IllegalArgumentException e) {
ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
ex.initCause(e);
throw ex;
} finally {
br.close();
}
}
private CharsRef parseSynonym(String line, CharsRef reuse) throws IOException {
if (reuse == null) {
reuse = new CharsRef(8);
}
int start = line.indexOf('\'')+1;
int end = line.lastIndexOf('\'');
String text = line.substring(start, end).replace("''", "'");
return analyze(analyzer, text, reuse);
}
private void addInternal(CharsRef synset[], int size) throws IOException {
if (size <= 1) {
return; // nothing to do
}
if (expand) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
add(synset[i], synset[j], false);
}
}
} else {
for (int i = 0; i < size; i++) {
add(synset[i], synset[0], false);
}
}
}
}
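As a companion to the parser above, a minimal sketch (not part of this commit) of loading a few wordnet prolog s(...) lines into a SynonymMap; the class name, sample lines, and the use of WhitespaceAnalyzer to analyze each synonym are illustrative assumptions.

import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.apache.lucene.util.Version;

public class WordnetParserSketch {
  public static SynonymMap load() throws Exception {
    String prolog =
        "s(100000001,1,'woods',n,1,0).\n" +
        "s(100000001,2,'wood',n,1,0).\n" +
        "s(100000001,3,'forest',n,1,0).\n";
    // dedup=true, expand=true: each synset member maps to every member of the synset
    WordnetSynonymParser parser =
        new WordnetSynonymParser(true, true, new WhitespaceAnalyzer(Version.LUCENE_31));
    parser.add(new StringReader(prolog));
    return parser.build();
  }
}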

View File

@ -1,3 +1,4 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
@ -14,13 +15,8 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<title>
wordnet
</title>
</head>
<body>
wordnet
</body>
<html><head></head>
<body>
Analysis components for Synonyms.
</body>
</html>

View File

@ -0,0 +1,144 @@
package org.apache.lucene.analysis.synonym;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.text.ParseException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.junit.Test;
/**
* Tests parser for the Solr synonyms format
* @lucene.experimental
*/
public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
/** Tests some simple examples from the solr wiki */
public void testSimple() throws Exception {
String testFile =
"i-pod, ipod, ipoooood\n" +
"foo => foo bar\n" +
"foo => baz\n" +
"this test, that testing";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
parser.add(new StringReader(testFile));
final SynonymMap map = parser.build();
Analyzer analyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
}
};
assertAnalyzesTo(analyzer, "ball",
new String[] { "ball" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "i-pod",
new String[] { "i-pod", "ipod", "ipoooood" },
new int[] { 1, 0, 0 });
assertAnalyzesTo(analyzer, "foo",
new String[] { "foo", "baz", "bar" },
new int[] { 1, 0, 1 });
assertAnalyzesTo(analyzer, "this test",
new String[] { "this", "that", "test", "testing" },
new int[] { 1, 0, 1, 0 });
}
/** parse a syn file with bad syntax */
@Test(expected=ParseException.class)
public void testInvalidDoubleMap() throws Exception {
String testFile = "a => b => c";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
parser.add(new StringReader(testFile));
}
/** parse a syn file with bad syntax */
@Test(expected=ParseException.class)
public void testInvalidAnalyzesToNothingOutput() throws Exception {
String testFile = "a => 1";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
parser.add(new StringReader(testFile));
}
/** parse a syn file with bad syntax */
@Test(expected=ParseException.class)
public void testInvalidAnalyzesToNothingInput() throws Exception {
String testFile = "1 => a";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
parser.add(new StringReader(testFile));
}
/** parse a syn file with bad syntax */
@Test(expected=ParseException.class)
public void testInvalidPositionsInput() throws Exception {
String testFile = "testola => the test";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
parser.add(new StringReader(testFile));
}
/** parse a syn file with bad syntax */
@Test(expected=ParseException.class)
public void testInvalidPositionsOutput() throws Exception {
String testFile = "the test => testola";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
parser.add(new StringReader(testFile));
}
/** parse a syn file with some escaped syntax chars */
public void testEscapedStuff() throws Exception {
String testFile =
"a\\=>a => b\\=>b\n" +
"a\\,a => b\\,b";
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
parser.add(new StringReader(testFile));
final SynonymMap map = parser.build();
Analyzer analyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
}
};
assertAnalyzesTo(analyzer, "ball",
new String[] { "ball" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "a=>a",
new String[] { "b=>b" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "a,a",
new String[] { "b,b" },
new int[] { 1 });
}
}
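The test above exercises the two rule forms of the Solr format (comma-separated equivalences and explicit "=>" mappings). Outside of tests the parser is driven the same way; a minimal standalone sketch, with the class name, rule text, and analyzer choice purely illustrative:

import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.Version;

public class SolrParserSketch {
  public static SynonymMap load() throws Exception {
    String rules =
        "i-pod, ipod\n" +      // equivalence list: expanded when expand=true
        "foo => foo bar\n";    // explicit mapping: "foo" maps to the right-hand side
    SolrSynonymParser parser =
        new SolrSynonymParser(true, true, new WhitespaceAnalyzer(Version.LUCENE_31));
    parser.add(new StringReader(rules));
    return parser.build();
  }
}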

View File

@ -0,0 +1,393 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util._TestUtil;
public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
private SynonymMap.Builder b;
private Tokenizer tokensIn;
private SynonymFilter tokensOut;
private CharTermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private OffsetAttribute offsetAtt;
private void add(String input, String output, boolean keepOrig) {
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
new CharsRef(output.replaceAll(" +", "\u0000")),
keepOrig);
}
private void assertEquals(CharTermAttribute term, String expected) {
assertEquals(expected.length(), term.length());
final char[] buffer = term.buffer();
for(int chIDX=0;chIDX<expected.length();chIDX++) {
assertEquals(expected.charAt(chIDX), buffer[chIDX]);
}
}
private void verify(String input, String output) throws Exception {
if (VERBOSE) {
System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
}
tokensIn.reset(new StringReader(input));
tokensOut.reset();
final String[] expected = output.split(" ");
int expectedUpto = 0;
while(tokensOut.incrementToken()) {
if (VERBOSE) {
System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
}
assertTrue(expectedUpto < expected.length);
final int startOffset = offsetAtt.startOffset();
final int endOffset = offsetAtt.endOffset();
final String[] expectedAtPos = expected[expectedUpto++].split("/");
for(int atPos=0;atPos<expectedAtPos.length;atPos++) {
if (atPos > 0) {
assertTrue(tokensOut.incrementToken());
if (VERBOSE) {
System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
}
}
assertEquals(termAtt, expectedAtPos[atPos]);
assertEquals(atPos == 0 ? 1 : 0,
posIncrAtt.getPositionIncrement());
// start/end offset of all tokens at same pos should
// be the same:
assertEquals(startOffset, offsetAtt.startOffset());
assertEquals(endOffset, offsetAtt.endOffset());
}
}
tokensOut.end();
tokensOut.close();
if (VERBOSE) {
System.out.println(" incr: END");
}
assertEquals(expectedUpto, expected.length);
}
public void testBasic() throws Exception {
b = new SynonymMap.Builder(true);
add("a", "foo", true);
add("a b", "bar fee", true);
add("b c", "dog collar", true);
add("c d", "dog harness holder extras", true);
add("m c e", "dog barks loudly", false);
add("e f", "foo bar", false);
add("e f", "baz bee", false);
add("z", "boo", false);
add("y", "bee", true);
tokensIn = new MockTokenizer(new StringReader("a"),
MockTokenizer.WHITESPACE,
true);
tokensIn.reset();
assertTrue(tokensIn.incrementToken());
assertFalse(tokensIn.incrementToken());
tokensIn.end();
tokensIn.close();
tokensOut = new SynonymFilter(tokensIn,
b.build(),
true);
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
verify("a b c", "a/bar b/fee c");
// syn output extends beyond input tokens
verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");
verify("a b a", "a/bar b/fee a/foo");
// outputs that add to one another:
verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");
// two outputs for same input
verify("e f", "foo/baz bar/bee");
// mixed keepOrig true/false:
verify("a m c e x", "a/foo dog barks loudly x");
verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
assertTrue(tokensOut.getCaptureCount() > 0);
// no captureStates when no syns matched
verify("p q r s t", "p q r s t");
assertEquals(0, tokensOut.getCaptureCount());
// no captureStates when only single-input syns, w/ no
// lookahead needed, matched
verify("p q z y t", "p q boo y/bee t");
assertEquals(0, tokensOut.getCaptureCount());
}
private String getRandomString(char start, int alphabetSize, int length) {
assert alphabetSize <= 26;
char[] s = new char[2*length];
for(int charIDX=0;charIDX<length;charIDX++) {
s[2*charIDX] = (char) (start + random.nextInt(alphabetSize));
s[2*charIDX+1] = ' ';
}
return new String(s);
}
private static class OneSyn {
String in;
List<String> out;
boolean keepOrig;
}
public String slowSynMatcher(String doc, List<OneSyn> syns, int maxOutputLength) {
assertTrue(doc.length() % 2 == 0);
final int numInputs = doc.length()/2;
boolean[] keepOrigs = new boolean[numInputs];
Arrays.fill(keepOrigs, false);
String[] outputs = new String[numInputs + maxOutputLength];
OneSyn[] matches = new OneSyn[numInputs];
for(OneSyn syn : syns) {
int idx = -1;
while(true) {
idx = doc.indexOf(syn.in, 1+idx);
if (idx == -1) {
break;
}
assertTrue(idx % 2 == 0);
final int matchIDX = idx/2;
assertTrue(syn.in.length() % 2 == 1);
if (matches[matchIDX] == null) {
matches[matchIDX] = syn;
} else if (syn.in.length() > matches[matchIDX].in.length()) {
// Greedy conflict resolution: longer match wins:
matches[matchIDX] = syn;
} else {
assertTrue(syn.in.length() < matches[matchIDX].in.length());
}
}
}
// Greedy conflict resolution: if syn matches a range of inputs,
// it prevents other syns from matching that range
for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
final OneSyn match = matches[inputIDX];
if (match != null) {
final int synInLength = (1+match.in.length())/2;
for(int nextInputIDX=inputIDX+1;nextInputIDX<numInputs && nextInputIDX<(inputIDX+synInLength);nextInputIDX++) {
matches[nextInputIDX] = null;
}
}
}
// Fill overlapping outputs:
for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
final OneSyn syn = matches[inputIDX];
if (syn == null) {
continue;
}
for(String synOut : syn.out) {
final String[] synOutputs = synOut.split(" ");
assertEquals(synOutputs.length, (1+synOut.length())/2);
final int matchEnd = inputIDX + synOutputs.length;
int synUpto = 0;
for(int matchIDX=inputIDX;matchIDX<matchEnd;matchIDX++) {
if (outputs[matchIDX] == null) {
outputs[matchIDX] = synOutputs[synUpto++];
} else {
outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
}
if (matchIDX < numInputs) {
keepOrigs[matchIDX] |= syn.keepOrig;
}
}
}
}
StringBuilder sb = new StringBuilder();
String[] inputTokens = doc.split(" ");
final int limit = inputTokens.length + maxOutputLength;
for(int inputIDX=0;inputIDX<limit;inputIDX++) {
boolean posHasOutput = false;
if (inputIDX >= numInputs && outputs[inputIDX] == null) {
break;
}
if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
sb.append(inputTokens[inputIDX]);
posHasOutput = true;
}
if (outputs[inputIDX] != null) {
if (posHasOutput) {
sb.append('/');
}
sb.append(outputs[inputIDX]);
}
if (inputIDX < limit-1) {
sb.append(' ');
}
}
return sb.toString();
}
public void testRandom() throws Exception {
final int alphabetSize = _TestUtil.nextInt(random, 2, 7);
final int docLen = atLeast(3000);
//final int docLen = 50;
final String document = getRandomString('a', alphabetSize, docLen);
if (VERBOSE) {
System.out.println("TEST: doc=" + document);
}
final int numSyn = atLeast(5);
//final int numSyn = 2;
final Map<String,OneSyn> synMap = new HashMap<String,OneSyn>();
final List<OneSyn> syns = new ArrayList<OneSyn>();
final boolean dedup = random.nextBoolean();
if (VERBOSE) {
System.out.println(" dedup=" + dedup);
}
b = new SynonymMap.Builder(dedup);
for(int synIDX=0;synIDX<numSyn;synIDX++) {
final String synIn = getRandomString('a', alphabetSize, _TestUtil.nextInt(random, 1, 5)).trim();
OneSyn s = synMap.get(synIn);
if (s == null) {
s = new OneSyn();
s.in = synIn;
syns.add(s);
s.out = new ArrayList<String>();
synMap.put(synIn, s);
s.keepOrig = random.nextBoolean();
}
final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim();
s.out.add(synOut);
add(synIn, synOut, s.keepOrig);
if (VERBOSE) {
System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
}
}
tokensIn = new MockTokenizer(new StringReader("a"),
MockTokenizer.WHITESPACE,
true);
tokensIn.reset();
assertTrue(tokensIn.incrementToken());
assertFalse(tokensIn.incrementToken());
tokensIn.end();
tokensIn.close();
tokensOut = new SynonymFilter(tokensIn,
b.build(),
true);
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
if (dedup) {
pruneDups(syns);
}
final String expected = slowSynMatcher(document, syns, 5);
if (VERBOSE) {
System.out.println("TEST: expected=" + expected);
}
verify(document, expected);
}
private void pruneDups(List<OneSyn> syns) {
Set<String> seen = new HashSet<String>();
for(OneSyn syn : syns) {
int idx = 0;
while(idx < syn.out.size()) {
String out = syn.out.get(idx);
if (!seen.contains(out)) {
seen.add(out);
idx++;
} else {
syn.out.remove(idx);
}
}
seen.clear();
}
}
private String randomNonEmptyString() {
while(true) {
final String s = _TestUtil.randomUnicodeString(random).trim();
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
return s;
}
}
}
/** simple random test, doesn't verify correctness.
* does verify it doesn't throw exceptions and that the stream doesn't misbehave
*/
public void testRandom2() throws Exception {
final int numIters = atLeast(10);
for (int i = 0; i < numIters; i++) {
b = new SynonymMap.Builder(random.nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
}
final SynonymMap map = b.build();
final boolean ignoreCase = random.nextBoolean();
final Analyzer analyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
}
};
checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
}
}
}

View File

@ -0,0 +1,72 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
Analyzer analyzer;
String synonymsFile =
"s(100000001,1,'woods',n,1,0).\n" +
"s(100000001,2,'wood',n,1,0).\n" +
"s(100000001,3,'forest',n,1,0).\n" +
"s(100000002,1,'wolfish',n,1,0).\n" +
"s(100000002,2,'ravenous',n,1,0).\n" +
"s(100000003,1,'king',n,1,1).\n" +
"s(100000003,2,'baron',n,1,1).\n" +
"s(100000004,1,'king''s evil',n,1,1).\n" +
"s(100000004,2,'king''s meany',n,1,1).\n";
public void testSynonyms() throws Exception {
WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random));
parser.add(new StringReader(synonymsFile));
final SynonymMap map = parser.build();
Analyzer analyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
}
};
/* all expansions */
assertAnalyzesTo(analyzer, "Lost in the woods",
new String[] { "Lost", "in", "the", "woods", "wood", "forest" },
new int[] { 0, 5, 8, 12, 12, 12 },
new int[] { 4, 7, 11, 17, 17, 17 },
new int[] { 1, 1, 1, 1, 0, 0 });
/* single quote */
assertAnalyzesTo(analyzer, "king",
new String[] { "king", "baron" });
/* multi words */
assertAnalyzesTo(analyzer, "king's evil",
new String[] { "king's", "king's", "evil", "meany" });
}
}

View File

@ -90,6 +90,10 @@ import org.apache.lucene.store.OutputStreamDataOutput;
*
* <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
* nothing else.
*
* <b>NOTE</b>: the FST file format is experimental and
* subject to change suddenly, requiring you to rebuild the
* FST suggest index.
*/
public class FSTLookup extends Lookup {

View File

@ -320,6 +320,9 @@ New Features
Optimizations
----------------------
* LUCENE-3233: Improved memory usage, build time, and performance of
SynonymFilterFactory. (Mike McCandless, Robert Muir)
Bug Fixes
----------------------

View File

@ -0,0 +1,157 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.text.ParseException;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. this is only a backwards compatibility
* mechanism that will be removed in Lucene 5.0
*/
// NOTE: rename this to "SynonymFilterFactory" and nuke that delegator in Lucene 5.0!
@Deprecated
final class FSTSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private SynonymMap map;
private boolean ignoreCase;
@Override
public TokenStream create(TokenStream input) {
return new SynonymFilter(input, map, ignoreCase);
}
@Override
public void inform(ResourceLoader loader) {
final boolean ignoreCase = getBoolean("ignoreCase", false);
this.ignoreCase = ignoreCase;
String tf = args.get("tokenizerFactory");
final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args);
Analyzer analyzer = new ReusableAnalyzerBase() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
return new TokenStreamComponents(tokenizer, stream);
}
};
String format = args.get("format");
try {
if (format == null || format.equals("solr")) {
// TODO: expose dedup as a parameter?
map = loadSolrSynonyms(loader, true, analyzer);
} else if (format.equals("wordnet")) {
map = loadWordnetSynonyms(loader, true, analyzer);
} else {
// TODO: somehow make this more pluggable
throw new RuntimeException("Unrecognized synonyms format: " + format);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/**
* Load synonyms from the solr format, "format=solr".
*/
private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
final boolean expand = getBoolean("expand", true);
String synonyms = args.get("synonyms");
if (synonyms == null)
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
File synonymFile = new File(synonyms);
if (synonymFile.exists()) {
decoder.reset();
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
} else {
List<String> files = StrUtils.splitFileNames(synonyms);
for (String file : files) {
decoder.reset();
parser.add(new InputStreamReader(loader.openResource(file), decoder));
}
}
return parser.build();
}
/**
* Load synonyms from the wordnet format, "format=wordnet".
*/
private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
final boolean expand = getBoolean("expand", true);
String synonyms = args.get("synonyms");
if (synonyms == null)
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer);
File synonymFile = new File(synonyms);
if (synonymFile.exists()) {
decoder.reset();
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
} else {
List<String> files = StrUtils.splitFileNames(synonyms);
for (String file : files) {
decoder.reset();
parser.add(new InputStreamReader(loader.openResource(file), decoder));
}
}
return parser.build();
}
private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname);
tokFactory.init(args);
return tokFactory;
}
}
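The factory above wires the parsed SynonymMap into an analysis chain for Solr; in plain Lucene the same wiring can be done by hand. A minimal sketch mirroring the pattern used throughout this patch (the class name and tokenizer choice are illustrative, not part of the commit):

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.Version;

public class SynonymAnalyzerSketch {
  public static Analyzer wrap(final SynonymMap map, final boolean ignoreCase) {
    return new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_31, reader);
        return new TokenStreamComponents(tokenizer,
                                         new SynonymFilter(tokenizer, map, ignoreCase));
      }
    };
  }
}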

View File

@ -0,0 +1,261 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
* <p>
* The matched tokens from the input stream may be optionally passed through (includeOrig=true)
* or discarded. If the original tokens are included, the position increments may be modified
* to retain absolute positions after merging with the synonym tokenstream.
* <p>
* Generated synonyms will start at the same position as the first matched source token.
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
*/
@Deprecated
final class SlowSynonymFilter extends TokenFilter {
private final SlowSynonymMap map; // Map<String, SynonymMap>
private Iterator<AttributeSource> replacement; // iterator over generated tokens
public SlowSynonymFilter(TokenStream in, SlowSynonymMap map) {
super(in);
if (map == null)
throw new IllegalArgumentException("map is required");
this.map = map;
// just ensuring these attributes exist...
addAttribute(CharTermAttribute.class);
addAttribute(PositionIncrementAttribute.class);
addAttribute(OffsetAttribute.class);
addAttribute(TypeAttribute.class);
}
/*
* Need to worry about multiple scenarios:
* - need to go for the longest match
* a b => foo #shouldn't match if "a b" is followed by "c d"
* a b c d => bar
* - need to backtrack - retry matches for tokens already read
* a b c d => foo
* b c => bar
* If the input stream is "a b c x", one will consume "a b c d"
* trying to match the first rule... all but "a" should be
* pushed back so a match may be made on "b c".
* - don't try and match generated tokens (thus need separate queue)
* matching is not recursive.
* - handle optional generation of original tokens in all these cases,
* merging token streams to preserve token positions.
* - preserve original positionIncrement of first matched token
*/
@Override
public boolean incrementToken() throws IOException {
while (true) {
// if there are any generated tokens, return them... don't try any
// matches against them, as we specifically don't want recursion.
if (replacement!=null && replacement.hasNext()) {
copy(this, replacement.next());
return true;
}
// common case fast-path of first token not matching anything
AttributeSource firstTok = nextTok();
if (firstTok == null) return false;
CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
if (result == null) {
copy(this, firstTok);
return true;
}
// fast-path failed, clone ourselves if needed
if (firstTok == this)
firstTok = cloneAttributes();
// OK, we matched a token, so find the longest match.
matched = new LinkedList<AttributeSource>();
result = match(result);
if (result==null) {
// no match, simply return the first token read.
copy(this, firstTok);
return true;
}
// reuse, or create new one each time?
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
//
// there was a match... let's generate the new tokens, merging
// in the matched tokens (position increments need adjusting)
//
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
boolean includeOrig = result.includeOrig();
AttributeSource origTok = includeOrig ? firstTok : null;
PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
int repPos=0; // curr position in replacement token stream
int pos=0; // current position in merged token stream
for (int i=0; i<result.synonyms.length; i++) {
Token repTok = result.synonyms[i];
AttributeSource newTok = firstTok.cloneAttributes();
CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
repPos += repTok.getPositionIncrement();
if (i==0) repPos=origPos; // make position of first token equal to original
// if necessary, insert original tokens and adjust position increment
while (origTok != null && origPos <= repPos) {
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
newPosIncAtt.setPositionIncrement(repPos - pos);
generated.add(newTok);
pos += newPosIncAtt.getPositionIncrement();
}
// finish up any leftover original tokens
while (origTok!=null) {
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPosInc.setPositionIncrement(origPos-pos);
generated.add(origTok);
pos += origPosInc.getPositionIncrement();
origTok = matched.isEmpty() ? null : matched.removeFirst();
if (origTok != null) {
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
origPos += origPosInc.getPositionIncrement();
}
}
// what if we replaced a longer sequence with a shorter one?
// a/0 b/5 => foo/0
// should I re-create the gap on the next buffered token?
replacement = generated.iterator();
// Now return to the top of the loop to read and return the first
// generated token.. The reason this is done is that we may have generated
// nothing at all, and may need to continue with more matching logic.
}
}
//
// Defer creation of the buffer until the first time it is used to
// optimize short fields with no matches.
//
private LinkedList<AttributeSource> buffer;
private LinkedList<AttributeSource> matched;
private boolean exhausted;
private AttributeSource nextTok() throws IOException {
if (buffer!=null && !buffer.isEmpty()) {
return buffer.removeFirst();
} else {
if (!exhausted && input.incrementToken()) {
return this;
} else {
exhausted = true;
return null;
}
}
}
private void pushTok(AttributeSource t) {
if (buffer==null) buffer=new LinkedList<AttributeSource>();
buffer.addFirst(t);
}
private SlowSynonymMap match(SlowSynonymMap map) throws IOException {
SlowSynonymMap result = null;
if (map.submap != null) {
AttributeSource tok = nextTok();
if (tok != null) {
// clone ourselves.
if (tok == this)
tok = cloneAttributes();
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
if (subMap != null) {
// recurse
result = match(subMap);
}
if (result != null) {
matched.addFirst(tok);
} else {
// push back unmatched token
pushTok(tok);
}
}
}
// no longer sequence matched, so if this node has synonyms, it's the match.
if (result==null && map.synonyms!=null) {
result = map;
}
return result;
}
private void copy(AttributeSource target, AttributeSource source) {
if (target != source)
source.copyTo(target);
}
@Override
public void reset() throws IOException {
input.reset();
replacement = null;
exhausted = false;
}
}

View File

@ -0,0 +1,188 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* Factory for {@link SlowSynonymFilter} (only used with luceneMatchVersion < 3.4)
* <pre class="prettyprint" >
* &lt;fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
* expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
*/
@Deprecated
final class SlowSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
public void inform(ResourceLoader loader) {
String synonyms = args.get("synonyms");
if (synonyms == null)
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
boolean ignoreCase = getBoolean("ignoreCase", false);
boolean expand = getBoolean("expand", true);
String tf = args.get("tokenizerFactory");
TokenizerFactory tokFactory = null;
if( tf != null ){
tokFactory = loadTokenizerFactory( loader, tf, args );
}
Iterable<String> wlist=loadRules( synonyms, loader );
synMap = new SlowSynonymMap(ignoreCase);
parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
}
/**
* @return a list of all rules
*/
protected Iterable<String> loadRules( String synonyms, ResourceLoader loader ) {
List<String> wlist=null;
try {
File synonymFile = new File(synonyms);
if (synonymFile.exists()) {
wlist = loader.getLines(synonyms);
} else {
List<String> files = StrUtils.splitFileNames(synonyms);
wlist = new ArrayList<String>();
for (String file : files) {
List<String> lines = loader.getLines(file.trim());
wlist.addAll(lines);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return wlist;
}
private SlowSynonymMap synMap;
static void parseRules(Iterable<String> rules, SlowSynonymMap map, String mappingSep,
String synSep, boolean expansion, TokenizerFactory tokFactory) {
int count=0;
for (String rule : rules) {
// To use regexes, we need an expression that specifies an odd number of chars.
// This can't really be done with string.split(), and since we need to
// do unescaping at some point anyway, we wouldn't be saving any effort
// by using regexes.
List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);
List<List<String>> source;
List<List<String>> target;
if (mapping.size() > 2) {
throw new RuntimeException("Invalid Synonym Rule:" + rule);
} else if (mapping.size()==2) {
source = getSynList(mapping.get(0), synSep, tokFactory);
target = getSynList(mapping.get(1), synSep, tokFactory);
} else {
source = getSynList(mapping.get(0), synSep, tokFactory);
if (expansion) {
// expand to all arguments
target = source;
} else {
// reduce to first argument
target = new ArrayList<List<String>>(1);
target.add(source.get(0));
}
}
boolean includeOrig=false;
for (List<String> fromToks : source) {
count++;
for (List<String> toToks : target) {
map.add(fromToks,
SlowSynonymMap.makeTokens(toToks),
includeOrig,
true
);
}
}
}
}
// a , b c , d e f => [[a],[b,c],[d,e,f]]
private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
List<String> strList = StrUtils.splitSmart(str, separator, false);
// now split on whitespace to get a list of token strings
List<List<String>> synList = new ArrayList<List<String>>();
for (String toks : strList) {
List<String> tokList = tokFactory == null ?
StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
synList.add(tokList);
}
return synList;
}
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
StringReader reader = new StringReader( source );
TokenStream ts = loadTokenizer(tokFactory, reader);
List<String> tokList = new ArrayList<String>();
try {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while (ts.incrementToken()){
if( termAtt.length() > 0 )
tokList.add( termAtt.toString() );
}
} catch (IOException e) {
throw new RuntimeException(e);
}
finally{
reader.close();
}
return tokList;
}
private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
tokFactory.init( args );
return tokFactory;
}
private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
return tokFactory.create( reader );
}
public SlowSynonymMap getSynonymMap() {
return synMap;
}
public SlowSynonymFilter create(TokenStream input) {
return new SlowSynonymFilter(input,synMap);
}
}

View File

@ -0,0 +1,162 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.Version;
import java.util.*;
/** Mapping rules for use with {@link SlowSynonymFilter}
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
*/
@Deprecated
class SlowSynonymMap {
/** @lucene.internal */
public CharArrayMap<SlowSynonymMap> submap; // recursive: Map<String, SynonymMap>
/** @lucene.internal */
public Token[] synonyms;
int flags;
static final int INCLUDE_ORIG=0x01;
static final int IGNORE_CASE=0x02;
public SlowSynonymMap() {}
public SlowSynonymMap(boolean ignoreCase) {
if (ignoreCase) flags |= IGNORE_CASE;
}
public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; }
public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; }
/**
* @param singleMatch List<String>, the sequence of strings to match
* @param replacement List<Token> the list of tokens to use on a match
* @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
* @param mergeExisting merge the replacement tokens with any other mappings that exist
*/
public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
SlowSynonymMap currMap = this;
for (String str : singleMatch) {
if (currMap.submap==null) {
// for now hardcode at 4.0, as its what the old code did.
// would be nice to fix, but shouldn't store a version in each submap!!!
currMap.submap = new CharArrayMap<SlowSynonymMap>(Version.LUCENE_40, 1, ignoreCase());
}
SlowSynonymMap map = currMap.submap.get(str);
if (map==null) {
map = new SlowSynonymMap();
map.flags |= flags & IGNORE_CASE;
currMap.submap.put(str, map);
}
currMap = map;
}
if (currMap.synonyms != null && !mergeExisting) {
throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
}
List<Token> superset = currMap.synonyms==null ? replacement :
mergeTokens(Arrays.asList(currMap.synonyms), replacement);
currMap.synonyms = superset.toArray(new Token[superset.size()]);
if (includeOrig) currMap.flags |= INCLUDE_ORIG;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("<");
if (synonyms!=null) {
sb.append("[");
for (int i=0; i<synonyms.length; i++) {
if (i!=0) sb.append(',');
sb.append(synonyms[i]);
}
if ((flags & INCLUDE_ORIG)!=0) {
sb.append(",ORIG");
}
sb.append("],");
}
sb.append(submap);
sb.append(">");
return sb.toString();
}
/** Produces a List<Token> from a List<String> */
public static List<Token> makeTokens(List<String> strings) {
List<Token> ret = new ArrayList<Token>(strings.size());
for (String str : strings) {
//Token newTok = new Token(str,0,0,"SYNONYM");
Token newTok = new Token(str, 0,0,"SYNONYM");
ret.add(newTok);
}
return ret;
}
/**
* Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
* the tokens end up at the same position.
*
* Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
* Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
*
*/
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
ArrayList<Token> result = new ArrayList<Token>();
if (lst1 ==null || lst2 ==null) {
if (lst2 != null) result.addAll(lst2);
if (lst1 != null) result.addAll(lst1);
return result;
}
int pos=0;
Iterator<Token> iter1=lst1.iterator();
Iterator<Token> iter2=lst2.iterator();
Token tok1 = iter1.hasNext() ? iter1.next() : null;
Token tok2 = iter2.hasNext() ? iter2.next() : null;
int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0;
int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0;
while(tok1!=null || tok2!=null) {
while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
tok.copyBuffer(tok1.buffer(), 0, tok1.length());
tok.setPositionIncrement(pos1-pos);
result.add(tok);
pos=pos1;
tok1 = iter1.hasNext() ? iter1.next() : null;
pos1 += tok1!=null ? tok1.getPositionIncrement() : 0;
}
while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
tok.copyBuffer(tok2.buffer(), 0, tok2.length());
tok.setPositionIncrement(pos2-pos);
result.add(tok);
pos=pos2;
tok2 = iter2.hasNext() ? iter2.next() : null;
pos2 += tok2!=null ? tok2.getPositionIncrement() : 0;
}
}
return result;
}
}

View File

@ -1,189 +1,54 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* Factory for {@link SynonymFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
* expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
* format="solr" ignoreCase="false" expand="true"
* tokenizerFactory="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
*/
public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private BaseTokenFilterFactory delegator;
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
if (luceneMatchVersion.onOrAfter(Version.LUCENE_34)) {
delegator = new FSTSynonymFilterFactory();
} else {
// check if you use the new optional arg "format". this makes no sense for the old one,
// as it's wired to solr's synonyms format only.
if (args.containsKey("format") && !args.get("format").equals("solr")) {
throw new IllegalArgumentException("You must specify luceneMatchVersion >= 3.4 to use alternate synonyms formats");
}
delegator = new SlowSynonymFilterFactory();
}
delegator.init(args);
}
@Override
public TokenStream create(TokenStream input) {
assert delegator != null : "init() was not called!";
return delegator.create(input);
}
@Override
public void inform(ResourceLoader loader) {
String synonyms = args.get("synonyms");
if (synonyms == null)
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
boolean ignoreCase = getBoolean("ignoreCase", false);
boolean expand = getBoolean("expand", true);
String tf = args.get("tokenizerFactory");
TokenizerFactory tokFactory = null;
if( tf != null ){
tokFactory = loadTokenizerFactory( loader, tf, args );
}
Iterable<String> wlist=loadRules( synonyms, loader );
synMap = new SynonymMap(ignoreCase);
parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
}
/**
* @return a list of all rules
*/
protected Iterable<String> loadRules( String synonyms, ResourceLoader loader ) {
List<String> wlist=null;
try {
File synonymFile = new File(synonyms);
if (synonymFile.exists()) {
wlist = loader.getLines(synonyms);
} else {
List<String> files = StrUtils.splitFileNames(synonyms);
wlist = new ArrayList<String>();
for (String file : files) {
List<String> lines = loader.getLines(file.trim());
wlist.addAll(lines);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return wlist;
}
private SynonymMap synMap;
static void parseRules(Iterable<String> rules, SynonymMap map, String mappingSep,
String synSep, boolean expansion, TokenizerFactory tokFactory) {
int count=0;
for (String rule : rules) {
// To use regexes, we need an expression that specifies an odd number of chars.
// This can't really be done with string.split(), and since we need to
// do unescaping at some point anyway, we wouldn't be saving any effort
// by using regexes.
List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);
List<List<String>> source;
List<List<String>> target;
if (mapping.size() > 2) {
throw new RuntimeException("Invalid Synonym Rule:" + rule);
} else if (mapping.size()==2) {
source = getSynList(mapping.get(0), synSep, tokFactory);
target = getSynList(mapping.get(1), synSep, tokFactory);
} else {
source = getSynList(mapping.get(0), synSep, tokFactory);
if (expansion) {
// expand to all arguments
target = source;
} else {
// reduce to first argument
target = new ArrayList<List<String>>(1);
target.add(source.get(0));
}
}
boolean includeOrig=false;
for (List<String> fromToks : source) {
count++;
for (List<String> toToks : target) {
map.add(fromToks,
SynonymMap.makeTokens(toToks),
includeOrig,
true
);
}
}
}
}
// a , b c , d e f => [[a],[b,c],[d,e,f]]
private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
List<String> strList = StrUtils.splitSmart(str, separator, false);
// now split on whitespace to get a list of token strings
List<List<String>> synList = new ArrayList<List<String>>();
for (String toks : strList) {
List<String> tokList = tokFactory == null ?
StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
synList.add(tokList);
}
return synList;
}
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
StringReader reader = new StringReader( source );
TokenStream ts = loadTokenizer(tokFactory, reader);
List<String> tokList = new ArrayList<String>();
try {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while (ts.incrementToken()){
if( termAtt.length() > 0 )
tokList.add( termAtt.toString() );
}
} catch (IOException e) {
throw new RuntimeException(e);
}
finally{
reader.close();
}
return tokList;
}
private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
tokFactory.init( args );
return tokFactory;
}
private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
return tokFactory.create( reader );
}
public SynonymMap getSynonymMap() {
return synMap;
}
public SynonymFilter create(TokenStream input) {
return new SynonymFilter(input,synMap);
assert delegator != null : "init() was not called!";
((ResourceLoaderAware) delegator).inform(loader);
}
}
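
A minimal sketch of the delegation above, grounded in init(): with luceneMatchVersion >= 3.4 the factory hands off to the FST-backed implementation and accepts the optional "format" arg, while older versions fall back to the slow implementation and reject any format other than "solr". The resource name and the try/catch scaffolding are illustrative assumptions.

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.SynonymFilterFactory;

public class SynonymFactoryDelegationSketch {
  public static void main(String[] ignored) {
    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", Version.LUCENE_34.toString());
    args.put("synonyms", "synonyms.txt");   // assumed resource name
    args.put("format", "wordnet");          // alternate formats only work on the >= 3.4 path

    new SynonymFilterFactory().init(args);  // delegates to FSTSynonymFilterFactory

    args.put("luceneMatchVersion", Version.LUCENE_33.toString());
    try {
      new SynonymFilterFactory().init(args); // pre-3.4 path is wired to the solr format only
    } catch (IllegalArgumentException expected) {
      System.out.println("rejected: " + expected.getMessage());
    }
  }
}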

View File

@ -17,30 +17,69 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.junit.Test;
import org.apache.solr.common.ResourceLoader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* @since solr 1.4
*/
public class TestMultiWordSynonyms extends BaseTokenTestCase {
@Test
public void testMultiWordSynonyms() throws IOException {
/**
* @deprecated Remove this test in 5.0
*/
@Deprecated
public void testMultiWordSynonymsOld() throws IOException {
List<String> rules = new ArrayList<String>();
rules.add("a b c,d");
SynonymMap synMap = new SynonymMap(true);
SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
SlowSynonymMap synMap = new SlowSynonymMap(true);
SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
SlowSynonymFilter ts = new SlowSynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
// This fails because ["e","e"] is the value of the token stream
assertTokenStreamContents(ts, new String[] { "a", "e" });
}
public void testMultiWordSynonyms() throws IOException {
SynonymFilterFactory factory = new SynonymFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.putAll(DEFAULT_VERSION_PARAM);
args.put("synonyms", "synonyms.txt");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader("a b c,d"));
TokenStream ts = factory.create(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false));
// This fails because ["e","e"] is the value of the token stream
assertTokenStreamContents(ts, new String[] { "a", "e" });
}
private class StringMockSolrResourceLoader implements ResourceLoader {
String text;
StringMockSolrResourceLoader(String text) {
this.text = text;
}
public List<String> getLines(String resource) throws IOException {
return null;
}
public Object newInstance(String cname, String... subpackages) {
return null;
}
public InputStream openResource(String resource) throws IOException {
return new ByteArrayInputStream(text.getBytes("UTF-8"));
}
}
}
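
For contrast with the non-matching "a e" input above, a hedged sketch (same package assumed, since parseRules and the submap field are package-private) of what the "a b c,d" rule becomes inside a SlowSynonymMap: a chain of nested submaps a -> b -> c whose leaf carries the synonym tokens, so an input that never reaches "c" is left untouched.

package org.apache.solr.analysis;

import java.util.ArrayList;
import java.util.List;

public class MultiWordRuleSketch {
  public static void main(String[] ignored) {
    List<String> rules = new ArrayList<String>();
    rules.add("a b c,d");                       // the rule used by the deprecated test above
    SlowSynonymMap synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);

    // With expand=true the top level holds entries for "a" (start of the phrase) and "d".
    System.out.println(synMap.submap.size());   // 2
    // "a e" never reaches the "c" leaf of the a -> b -> c chain, so nothing is rewritten.
  }
}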

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
@ -29,51 +29,52 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
/**
* @deprecated Remove this test in Lucene 5.0
*/
public class TestSynonymFilter extends BaseTokenStreamTestCase {
@Deprecated
public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
static List<String> strings(String str) {
String[] arr = str.split(" ");
return Arrays.asList(arr);
}
static void assertTokenizesTo(SynonymMap dict, String input,
static void assertTokenizesTo(SlowSynonymMap dict, String input,
String expected[]) throws IOException {
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected);
}
static void assertTokenizesTo(SynonymMap dict, String input,
static void assertTokenizesTo(SlowSynonymMap dict, String input,
String expected[], int posIncs[]) throws IOException {
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
static void assertTokenizesTo(SlowSynonymMap dict, List<Token> input,
String expected[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
static void assertTokenizesTo(SlowSynonymMap dict, List<Token> input,
String expected[], int startOffsets[], int endOffsets[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
posIncs);
}
public void testMatching() throws IOException {
SynonymMap map = new SynonymMap();
SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@ -110,7 +111,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
}
public void testIncludeOrig() throws IOException {
SynonymMap map = new SynonymMap();
SlowSynonymMap map = new SlowSynonymMap();
boolean orig = true;
boolean merge = true;
@ -167,7 +168,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testMapMerge() throws IOException {
SynonymMap map = new SynonymMap();
SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@ -206,7 +207,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testOverlap() throws IOException {
SynonymMap map = new SynonymMap();
SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@ -229,7 +230,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
}
public void testPositionIncrements() throws IOException {
SynonymMap map = new SynonymMap();
SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;
@ -264,7 +265,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
public void testPositionIncrementsWithOrig() throws IOException {
SynonymMap map = new SynonymMap();
SlowSynonymMap map = new SlowSynonymMap();
boolean orig = true;
boolean merge = true;
@ -304,7 +305,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
// x=>y
// analysing "a x" causes "y" to have a bad offset (end less than start)
// SOLR-167
SynonymMap map = new SynonymMap();
SlowSynonymMap map = new SlowSynonymMap();
boolean orig = false;
boolean merge = true;

View File

@ -0,0 +1,62 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.util.Version;
import org.apache.solr.core.SolrResourceLoader;
public class TestSynonymFilterFactory extends BaseTokenTestCase {
/** test that we can parse and use the solr syn file */
public void testSynonyms() throws Exception {
SynonymFilterFactory factory = new SynonymFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.putAll(DEFAULT_VERSION_PARAM);
args.put("synonyms", "synonyms.txt");
factory.init(args);
factory.inform(new SolrResourceLoader(null, null));
TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
assertTrue(ts instanceof SynonymFilter);
assertTokenStreamContents(ts,
new String[] { "GB", "gib", "gigabyte", "gigabytes" },
new int[] { 1, 0, 0, 0 });
}
/** test that we can parse and use the solr syn file, with the old impl
* @deprecated Remove this test in Lucene 5.0 */
@Deprecated
public void testSynonymsOld() throws Exception {
SynonymFilterFactory factory = new SynonymFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("luceneMatchVersion", Version.LUCENE_33.toString());
args.put("synonyms", "synonyms.txt");
factory.init(args);
factory.inform(new SolrResourceLoader(null, null));
TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
assertTrue(ts instanceof SlowSynonymFilter);
assertTokenStreamContents(ts,
new String[] { "GB", "gib", "gigabyte", "gigabytes" },
new int[] { 1, 0, 0, 0 });
}
}
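
The synonyms.txt contents are not part of this diff; the GB expansion expected above implies a solr-format rule along the lines of the (assumed) one below, shown here being parsed with the old SlowSynonymFilterFactory.parseRules for illustration (same package assumed for access to the package-private helpers).

package org.apache.solr.analysis;

import java.util.ArrayList;
import java.util.List;

public class GigabyteRuleSketch {
  public static void main(String[] ignored) {
    List<String> rules = new ArrayList<String>();
    rules.add("GB,gib,gigabyte,gigabytes");     // assumed rule; the real synonyms.txt is not in this commit
    SlowSynonymMap synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);

    // expand=true: each of the four terms maps to all four variants, which is why the
    // factory test above sees "GB", "gib", "gigabyte", "gigabytes" stacked at one position.
    System.out.println(synMap.submap.size());   // 4
  }
}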

View File

@ -25,32 +25,35 @@ import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.common.ResourceLoader;
/**
* @deprecated Remove this test in Lucene 5.0
*/
@Deprecated
public class TestSynonymMap extends LuceneTestCase {
public void testInvalidMappingRules() throws Exception {
SynonymMap synMap = new SynonymMap( true );
SlowSynonymMap synMap = new SlowSynonymMap( true );
List<String> rules = new ArrayList<String>( 1 );
rules.add( "a=>b=>c" );
try{
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
fail( "RuntimeException must be thrown." );
}
catch( RuntimeException expected ){}
}
public void testReadMappingRules() throws Exception {
SynonymMap synMap;
SlowSynonymMap synMap;
// (a)->[b]
List<String> rules = new ArrayList<String>();
rules.add( "a=>b" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "b" );
@ -58,8 +61,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (b)->[c]
rules.clear();
rules.add( "a,b=>c" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "c" );
assertTokIncludes( synMap, "b", "c" );
@ -67,8 +70,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (a)->[b][c]
rules.clear();
rules.add( "a=>b,c" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "b" );
assertTokIncludes( synMap, "a", "c" );
@ -78,8 +81,8 @@ public class TestSynonymMap extends LuceneTestCase {
rules.clear();
rules.add( "a=>a1" );
rules.add( "a b=>a2" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a1" );
assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@ -92,8 +95,8 @@ public class TestSynonymMap extends LuceneTestCase {
rules.add( "a=>a1" );
rules.add( "a b=>a2" );
rules.add( "a c=>a3" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 1, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a1" );
assertEquals( 2, getSubSynonymMap( synMap, "a" ).submap.size() );
@ -109,8 +112,8 @@ public class TestSynonymMap extends LuceneTestCase {
rules.add( "a b=>a2" );
rules.add( "b=>b1" );
rules.add( "b c=>b2" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a1" );
assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@ -121,14 +124,14 @@ public class TestSynonymMap extends LuceneTestCase {
}
public void testRead1waySynonymRules() throws Exception {
SynonymMap synMap;
SlowSynonymMap synMap;
// (a)->[a]
// (b)->[a]
List<String> rules = new ArrayList<String>();
rules.add( "a,b" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "b", "a" );
@ -138,8 +141,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (c)->[a]
rules.clear();
rules.add( "a,b,c" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 3, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "b", "a" );
@ -149,8 +152,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (b1)->(b2)->[a]
rules.clear();
rules.add( "a,b1 b2" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
@ -160,8 +163,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (b)->[a1][a2]
rules.clear();
rules.add( "a1 a2,b" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
assertEquals( 2, synMap.submap.size() );
assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@ -171,14 +174,14 @@ public class TestSynonymMap extends LuceneTestCase {
}
public void testRead2waySynonymRules() throws Exception {
SynonymMap synMap;
SlowSynonymMap synMap;
// (a)->[a][b]
// (b)->[a][b]
List<String> rules = new ArrayList<String>();
rules.add( "a,b" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b" );
@ -190,8 +193,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (c)->[a][b][c]
rules.clear();
rules.add( "a,b,c" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 3, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b" );
@ -209,8 +212,8 @@ public class TestSynonymMap extends LuceneTestCase {
// [b1][b2]
rules.clear();
rules.add( "a,b1 b2" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b1" );
@ -226,8 +229,8 @@ public class TestSynonymMap extends LuceneTestCase {
// [b]
rules.clear();
rules.add( "a1 a2,b" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
assertEquals( 2, synMap.submap.size() );
assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@ -239,7 +242,7 @@ public class TestSynonymMap extends LuceneTestCase {
}
public void testBigramTokenizer() throws Exception {
SynonymMap synMap;
SlowSynonymMap synMap;
// prepare bi-gram tokenizer factory
BaseTokenizerFactory tf = new NGramTokenizerFactory();
@ -251,8 +254,8 @@ public class TestSynonymMap extends LuceneTestCase {
// (ab)->(bc)->(cd)->[ef][fg][gh]
List<String> rules = new ArrayList<String>();
rules.add( "abcd=>efgh" );
synMap = new SynonymMap( true );
SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
synMap = new SlowSynonymMap( true );
SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
assertEquals( 1, synMap.submap.size() );
assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
@ -265,7 +268,7 @@ public class TestSynonymMap extends LuceneTestCase {
public void testLoadRules() throws Exception {
Map<String, String> args = new HashMap<String, String>();
args.put( "synonyms", "something.txt" );
SynonymFilterFactory ff = new SynonymFilterFactory();
SlowSynonymFilterFactory ff = new SlowSynonymFilterFactory();
ff.init(args);
ff.inform( new ResourceLoader() {
@Override
@ -289,7 +292,7 @@ public class TestSynonymMap extends LuceneTestCase {
}
});
SynonymMap synMap = ff.getSynonymMap();
SlowSynonymMap synMap = ff.getSynonymMap();
assertEquals( 2, synMap.submap.size() );
assertTokIncludes( synMap, "a", "a" );
assertTokIncludes( synMap, "a", "b" );
@ -298,7 +301,7 @@ public class TestSynonymMap extends LuceneTestCase {
}
private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception {
private void assertTokIncludes( SlowSynonymMap map, String src, String exp ) throws Exception {
Token[] tokens = map.submap.get( src ).synonyms;
boolean inc = false;
for( Token token : tokens ){
@ -308,7 +311,7 @@ public class TestSynonymMap extends LuceneTestCase {
assertTrue( inc );
}
private SynonymMap getSubSynonymMap( SynonymMap map, String src ){
private SlowSynonymMap getSubSynonymMap( SlowSynonymMap map, String src ){
return map.submap.get( src );
}
}
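
Finally, a hedged sketch of the FST-backed API this change introduces (LUCENE-3233), for comparison with the slow map exercised above. The SynonymMap.Builder/CharsRef signatures are assumed to match the form they take in the released 3.4 line; the rule and input text are illustrative only.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

public class FstSynonymSketch {
  public static void main(String[] ignored) throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);        // dedup duplicate rules (assumed signature)
    builder.add(new CharsRef("GB"), new CharsRef("gigabyte"), true);  // keep the original token too
    SynonymMap map = builder.build();                                 // compiles the rules into an FST

    TokenStream ts = new SynonymFilter(
        new WhitespaceTokenizer(Version.LUCENE_34, new StringReader("10 GB ram")), map, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());  // "GB" and "gigabyte" come back stacked at one position
    }
    ts.end();
    ts.close();
  }
}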