mirror of https://github.com/apache/lucene.git
LUCENE-3233: improve ram/perf of SynonymFilter, add wordnet parsing, nuke contrib/wordnet
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145158 13f79535-47bb-0310-9956-ffa450edef68
Parent: 19fd2508c6
Commit: 015ecfa0a0
@@ -230,7 +230,6 @@
      <packageset dir="contrib/misc/src/java"/>
      <packageset dir="contrib/queries/src/java"/>
      <packageset dir="contrib/spatial/src/java"/>
-     <packageset dir="contrib/wordnet/src/java"/>
      <packageset dir="contrib/xml-query-parser/src/java"/>
      <packageset dir="contrib/queryparser/src/java"/>
      <!-- end alpha sort -->

@@ -250,7 +249,6 @@
      <group title="contrib: Queries" packages="org.apache.lucene.search.similar*:org.apache.lucene.search.regex*:org.apache.regexp*"/>
      <group title="contrib: Query Parser" packages="org.apache.lucene.queryParser.*"/>
      <group title="contrib: Spatial" packages="org.apache.lucene.spatial*"/>
-     <group title="contrib: WordNet" packages="org.apache.lucene.wordnet*"/>
      <group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>

    </sources>
@@ -5,11 +5,6 @@ http://s.apache.org/luceneversions

======================= Trunk (not yet released) =======================

-Changes in runtime behavior
-
-* LUCENE-3250: Wordnet's SynExpand requires a non-null Analyzer (it no longer
-  treats null as StandardAnalyzer). (Robert Muir)
-
Build

* LUCENE-2845: Moved contrib/benchmark to modules.

@@ -78,6 +73,10 @@ New Features
  documents must be indexed as a document block, using
  IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless)

+* LUCENE-3233: Added SynonymFilter for applying multi-word synonyms
+  during indexing or querying (with parsers for wordnet and solr formats).
+  Removed contrib/wordnet. (Robert Muir, Mike McCandless)
+
API Changes

Bug Fixes
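For context on the LUCENE-3233 entry above, a minimal sketch of how the new synonym machinery is typically wired up. The package and builder API shown here (org.apache.lucene.analysis.synonym, SynonymMap.Builder) are assumptions based on the analysis module this commit targets; they do not appear in the hunks shown in this diff.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;

// Sketch only: build an in-memory synonym map and inject synonyms into an
// existing token stream.  The "parsers for wordnet and solr formats" named in
// the CHANGES entry can populate a SynonymMap.Builder from a synonym file
// instead of the explicit add() call used here.
public final class SynonymFilterSketch {
  public static TokenStream withSynonyms(TokenStream in) throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true); // true = dedup identical rules
    // a token "woods" also produces the synonym "forest" at the same position
    builder.add(new CharsRef("woods"), new CharsRef("forest"), true); // true = keep the original token
    SynonymMap map = builder.build();
    return new SynonymFilter(in, map, true); // true = ignore case when matching
  }
}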
@@ -1,5 +0,0 @@
As of 2002-11-13, the WordNet Lucene contribution contains a single Java class:
org.apache.lucene.wordnet.Syns2Index.

This class creates a Lucene index with synonyms for English words from
a Prolog file, which is a part of the WordNet database.
Deleted: contrib/wordnet/build.xml
@@ -1,70 +0,0 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="wordnet" default="default">
|
||||
|
||||
<description>
|
||||
WordNet
|
||||
</description>
|
||||
|
||||
<property name="prolog.file" location="prologwn/wn_s.pl"/>
|
||||
<property name="synindex.dir" location="index"/>
|
||||
|
||||
<available property="synindex.exists" file="${synindex.dir}" type="dir"/>
|
||||
|
||||
<import file="../contrib-build.xml"/>
|
||||
|
||||
<target name="index" depends="compile" description="Build WordNet index">
|
||||
<fail if="synindex.exists">
|
||||
Index already exists - must remove first.
|
||||
</fail>
|
||||
|
||||
<java classname="org.apache.lucene.wordnet.Syns2Index">
|
||||
<classpath>
|
||||
<path refid="compile.classpath"/>
|
||||
<pathelement location="${build.dir}/classes"/>
|
||||
</classpath>
|
||||
|
||||
<arg file="${prolog.file}"/>
|
||||
<arg file="${synindex.dir}"/>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
|
||||
<target name="synonym" description="Find synonyms for word">
|
||||
<fail unless="synindex.exists">
|
||||
Index does not exist.
|
||||
</fail>
|
||||
|
||||
<fail unless="word">
|
||||
Must specify 'word' property.
|
||||
</fail>
|
||||
|
||||
<java classname="org.apache.lucene.wordnet.SynLookup">
|
||||
<classpath>
|
||||
<path refid="compile.classpath"/>
|
||||
<pathelement location="${build.dir}/classes"/>
|
||||
</classpath>
|
||||
|
||||
<arg file="${synindex.dir}"/>
|
||||
<arg value="${word}"/>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
</project>
|
|
Deleted: org/apache/lucene/wordnet/SynExpand.java
@@ -1,142 +0,0 @@
|
|||
package org.apache.lucene.wordnet;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
|
||||
|
||||
/**
|
||||
* Expand a query by looking up synonyms for every term.
|
||||
* You need to invoke {@link Syns2Index} first to build the synonym index.
|
||||
*
|
||||
* @see Syns2Index
|
||||
*/
|
||||
public final class SynExpand {
|
||||
|
||||
/**
|
||||
* Perform synonym expansion on a query.
|
||||
*
|
||||
* @param query users query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser.
|
||||
*
|
||||
* @param syns an IndexSearcher opened on the Lucene index you previously created with {@link Syns2Index}. The searcher is not closed or otherwise altered.
|
||||
*
|
||||
* @param a analyzer used to parse the user's query.
|
||||
*
|
||||
* @param f optional field name to search in or null if you want the default of "contents"
|
||||
*
|
||||
* @param boost optional boost applied to synonyms else no boost is applied
|
||||
*
|
||||
* @return the expanded Query
|
||||
*/
|
||||
public static Query expand( String query,
|
||||
IndexSearcher syns,
|
||||
Analyzer a,
|
||||
String f,
|
||||
final float boost)
|
||||
throws IOException
|
||||
{
|
||||
final Set<String> already = new HashSet<String>(); // avoid dups
|
||||
List<String> top = new LinkedList<String>(); // needs to be separately listed..
|
||||
final String field = ( f == null) ? "contents" : f;
|
||||
|
||||
// [1] Parse query into separate words so that when we expand we can avoid dups
|
||||
TokenStream ts = a.reusableTokenStream( field, new StringReader( query));
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
String word = termAtt.toString();
|
||||
if ( already.add( word))
|
||||
top.add( word);
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
final BooleanQuery tmp = new BooleanQuery();
|
||||
|
||||
// [2] form query
|
||||
Iterator<String> it = top.iterator();
|
||||
while ( it.hasNext())
|
||||
{
|
||||
// [2a] add to level words in
|
||||
String word = it.next();
|
||||
TermQuery tq = new TermQuery( new Term( field, word));
|
||||
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
||||
|
||||
syns.search(new TermQuery( new Term(Syns2Index.F_WORD, word)), new Collector() {
|
||||
IndexReader reader;
|
||||
|
||||
@Override
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc) throws IOException {
|
||||
Document d = reader.document(doc);
|
||||
String[] values = d.getValues( Syns2Index.F_SYN);
|
||||
for ( int j = 0; j < values.length; j++)
|
||||
{
|
||||
String syn = values[ j];
|
||||
if ( already.add( syn)) // avoid dups of top level words and synonyms
|
||||
{
|
||||
TermQuery tq = new TermQuery( new Term( field, syn));
|
||||
if ( boost > 0) // else keep normal 1.0
|
||||
tq.setBoost( boost);
|
||||
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(AtomicReaderContext context)
|
||||
throws IOException {
|
||||
this.reader = context.reader;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) throws IOException {}
|
||||
});
|
||||
|
||||
// [2b] add in unique synonyms
|
||||
}
|
||||
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
}
|
|
Deleted: org/apache/lucene/wordnet/SynLookup.java
@@ -1,170 +0,0 @@
|
|||
package org.apache.lucene.wordnet;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TotalHitCountCollector;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
|
||||
|
||||
/**
|
||||
* Test program to look up synonyms.
|
||||
*/
|
||||
public class SynLookup {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
if (args.length != 2) {
|
||||
System.out.println(
|
||||
"java org.apache.lucene.wordnet.SynLookup <index path> <word>");
|
||||
}
|
||||
|
||||
FSDirectory directory = FSDirectory.open(new File(args[0]));
|
||||
IndexSearcher searcher = new IndexSearcher(directory, true);
|
||||
|
||||
String word = args[1];
|
||||
Query query = new TermQuery(new Term(Syns2Index.F_WORD, word));
|
||||
TotalHitCountCollector countingCollector = new TotalHitCountCollector();
|
||||
searcher.search(query, countingCollector);
|
||||
|
||||
if (countingCollector.getTotalHits() == 0) {
|
||||
System.out.println("No synonyms found for " + word);
|
||||
} else {
|
||||
System.out.println("Synonyms found for \"" + word + "\":");
|
||||
}
|
||||
|
||||
ScoreDoc[] hits = searcher.search(query, countingCollector.getTotalHits()).scoreDocs;
|
||||
|
||||
for (int i = 0; i < hits.length; i++) {
|
||||
Document doc = searcher.doc(hits[i].doc);
|
||||
|
||||
String[] values = doc.getValues(Syns2Index.F_SYN);
|
||||
|
||||
for (int j = 0; j < values.length; j++) {
|
||||
System.out.println(values[j]);
|
||||
}
|
||||
}
|
||||
|
||||
searcher.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Perform synonym expansion on a query.
|
||||
*
|
||||
* @param query
|
||||
* @param syns
|
||||
* @param a
|
||||
* @param field
|
||||
* @param boost
|
||||
*/
|
||||
public static Query expand( String query,
|
||||
IndexSearcher syns,
|
||||
Analyzer a,
|
||||
final String field,
|
||||
final float boost)
|
||||
throws IOException
|
||||
{
|
||||
final Set<String> already = new HashSet<String>(); // avoid dups
|
||||
List<String> top = new LinkedList<String>(); // needs to be separately listed..
|
||||
|
||||
// [1] Parse query into separate words so that when we expand we can avoid dups
|
||||
TokenStream ts = a.reusableTokenStream( field, new StringReader( query));
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
||||
while (ts.incrementToken()) {
|
||||
String word = termAtt.toString();
|
||||
if ( already.add( word))
|
||||
top.add( word);
|
||||
}
|
||||
final BooleanQuery tmp = new BooleanQuery();
|
||||
|
||||
// [2] form query
|
||||
Iterator<String> it = top.iterator();
|
||||
while ( it.hasNext())
|
||||
{
|
||||
// [2a] add to level words in
|
||||
String word = it.next();
|
||||
TermQuery tq = new TermQuery( new Term( field, word));
|
||||
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
||||
|
||||
// [2b] add in unique synonyms
|
||||
syns.search(new TermQuery( new Term(Syns2Index.F_WORD, word)), new Collector() {
|
||||
IndexReader reader;
|
||||
|
||||
@Override
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc) throws IOException {
|
||||
Document d = reader.document(doc);
|
||||
String[] values = d.getValues( Syns2Index.F_SYN);
|
||||
for ( int j = 0; j < values.length; j++)
|
||||
{
|
||||
String syn = values[ j];
|
||||
if ( already.add( syn))
|
||||
{
|
||||
TermQuery tq = new TermQuery( new Term( field, syn));
|
||||
if ( boost > 0) // else keep normal 1.0
|
||||
tq.setBoost( boost);
|
||||
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(AtomicReaderContext context)
|
||||
throws IOException {
|
||||
this.reader = context.reader;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) throws IOException {}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
}
|
|
Deleted: org/apache/lucene/wordnet/SynonymMap.java
@@ -1,400 +0,0 @@
|
|||
package org.apache.lucene.wordnet;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Loads the <a target="_blank"
|
||||
* href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
|
||||
* href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
|
||||
* into a thread-safe main-memory hash map that can be used for fast
|
||||
* high-frequency lookups of synonyms for any given (lowercase) word string.
|
||||
* <p>
|
||||
* There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
|
||||
* There does not necessarily hold: A -> B, B -> C then A -> C.
|
||||
* <p>
|
||||
* Loading typically takes some 1.5 secs, so should be done only once per
|
||||
* (server) program execution, using a singleton pattern. Once loaded, a
|
||||
* synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).
|
||||
* A loaded default synonym map consumes about 10 MB main memory.
|
||||
* An instance is immutable, hence thread-safe.
|
||||
* <p>
|
||||
* This implementation borrows some ideas from the Lucene Syns2Index demo that
|
||||
* Dave Spencer originally contributed to Lucene. Dave's approach
|
||||
* involved a persistent Lucene index which is suitable for occasional
|
||||
* lookups or very large synonym tables, but considered unsuitable for
|
||||
* high-frequency lookups of medium size synonym tables.
|
||||
* <p>
|
||||
* Example Usage:
|
||||
* <pre class="prettyprint">
|
||||
* String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
|
||||
* SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
|
||||
* for (int i = 0; i < words.length; i++) {
|
||||
* String[] synonyms = map.getSynonyms(words[i]);
|
||||
* System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
|
||||
* }
|
||||
* </pre>
|
||||
* <b/>
|
||||
* Example output:
|
||||
* <pre class="prettyprint">
|
||||
* hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
|
||||
* woods:[forest, wood]
|
||||
* forest:[afforest, timber, timberland, wood, woodland, woods]
|
||||
* wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
|
||||
* xxxx:[]
|
||||
* </pre>
|
||||
*
|
||||
* <p>
|
||||
* <b>See also:</b><br>
|
||||
* <a target="_blank"
|
||||
* href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
|
||||
* man page </a><br>
|
||||
* <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
|
||||
*/
|
||||
public class SynonymMap {
|
||||
|
||||
/** the index data; Map<String word, String[] synonyms> */
|
||||
private final HashMap<String,String[]> table;
|
||||
|
||||
private static final String[] EMPTY = new String[0];
|
||||
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
/**
|
||||
* Constructs an instance, loading WordNet synonym data from the given input
|
||||
* stream. Finally closes the stream. The words in the stream must be in
|
||||
* UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
|
||||
*
|
||||
* @param input
|
||||
* the stream to read from (null indicates an empty synonym map)
|
||||
* @throws IOException
|
||||
* if an error occurred while reading the stream.
|
||||
*/
|
||||
public SynonymMap(InputStream input) throws IOException {
|
||||
this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the synonym set for the given word, sorted ascending.
|
||||
*
|
||||
* @param word
|
||||
* the word to lookup (must be in lowercase).
|
||||
* @return the synonyms; a set of zero or more words, sorted ascending, each
|
||||
* word containing lowercase characters that satisfy
|
||||
* <code>Character.isLetter()</code>.
|
||||
*/
|
||||
public String[] getSynonyms(String word) {
|
||||
String[] synonyms = table.get(word);
|
||||
if (synonyms == null) return EMPTY;
|
||||
String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
|
||||
System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
|
||||
return copy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a String representation of the index data for debugging purposes.
|
||||
*
|
||||
* @return a String representation
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder buf = new StringBuilder();
|
||||
Iterator<String> iter = new TreeMap<String,String[]>(table).keySet().iterator();
|
||||
int count = 0;
|
||||
int f0 = 0;
|
||||
int f1 = 0;
|
||||
int f2 = 0;
|
||||
int f3 = 0;
|
||||
|
||||
while (iter.hasNext()) {
|
||||
String word = iter.next();
|
||||
buf.append(word + ":");
|
||||
String[] synonyms = getSynonyms(word);
|
||||
buf.append(Arrays.asList(synonyms));
|
||||
buf.append("\n");
|
||||
count += synonyms.length;
|
||||
if (synonyms.length == 0) f0++;
|
||||
if (synonyms.length == 1) f1++;
|
||||
if (synonyms.length == 2) f2++;
|
||||
if (synonyms.length == 3) f3++;
|
||||
}
|
||||
|
||||
buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Analyzes/transforms the given word on input stream loading. This default implementation simply
|
||||
* lowercases the word. Override this method with a custom stemming
|
||||
* algorithm or similar, if desired.
|
||||
*
|
||||
* @param word
|
||||
* the word to analyze
|
||||
* @return the same word, or a different word (or null to indicate that the
|
||||
* word should be ignored)
|
||||
*/
|
||||
protected String analyze(String word) {
|
||||
return word.toLowerCase();
|
||||
}
|
||||
|
||||
protected boolean isValid(String str) {
|
||||
for (int i=str.length(); --i >= 0; ) {
|
||||
if (!Character.isLetter(str.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private HashMap<String,String[]> read(byte[] data) {
|
||||
int WORDS = (int) (76401 / 0.7); // presizing
|
||||
int GROUPS = (int) (88022 / 0.7); // presizing
|
||||
HashMap<String,ArrayList<Integer>> word2Groups = new HashMap<String,ArrayList<Integer>>(WORDS); // Map<String word, int[] groups>
|
||||
HashMap<Integer,ArrayList<String>> group2Words = new HashMap<Integer,ArrayList<String>>(GROUPS); // Map<int group, String[] words>
|
||||
HashMap<String,String> internedWords = new HashMap<String,String>(WORDS);// Map<String word, String word>
|
||||
|
||||
Charset charset = Charset.forName("UTF-8");
|
||||
int lastNum = -1;
|
||||
Integer lastGroup = null;
|
||||
int len = data.length;
|
||||
int i=0;
|
||||
|
||||
while (i < len) { // until EOF
|
||||
/* Part A: Parse a line */
|
||||
|
||||
// scan to beginning of group
|
||||
while (i < len && data[i] != '(') i++;
|
||||
if (i >= len) break; // EOF
|
||||
i++;
|
||||
|
||||
// parse group
|
||||
int num = 0;
|
||||
while (i < len && data[i] != ',') {
|
||||
num = 10*num + (data[i] - 48);
|
||||
i++;
|
||||
}
|
||||
i++;
|
||||
// if (DEBUG) System.err.println("num="+ num);
|
||||
|
||||
// scan to beginning of word
|
||||
while (i < len && data[i] != '\'') i++;
|
||||
i++;
|
||||
|
||||
// scan to end of word
|
||||
int start = i;
|
||||
do {
|
||||
while (i < len && data[i] != '\'') i++;
|
||||
i++;
|
||||
} while (i < len && data[i] != ','); // word must end with "',"
|
||||
|
||||
if (i >= len) break; // EOF
|
||||
String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
|
||||
// String word = new String(data, 0, start, i-start-1); // ASCII
|
||||
|
||||
/*
|
||||
* Part B: ignore phrases (with spaces and hyphens) and
|
||||
* non-alphabetic words, and let user customize word (e.g. do some
|
||||
* stemming)
|
||||
*/
|
||||
if (!isValid(word)) continue; // ignore
|
||||
word = analyze(word);
|
||||
if (word == null || word.length() == 0) continue; // ignore
|
||||
|
||||
|
||||
/* Part C: Add (group,word) to tables */
|
||||
|
||||
// ensure compact string representation, minimizing memory overhead
|
||||
String w = internedWords.get(word);
|
||||
if (w == null) {
|
||||
word = new String(word); // ensure compact string
|
||||
internedWords.put(word, word);
|
||||
} else {
|
||||
word = w;
|
||||
}
|
||||
|
||||
Integer group = lastGroup;
|
||||
if (num != lastNum) {
|
||||
group = Integer.valueOf(num);
|
||||
lastGroup = group;
|
||||
lastNum = num;
|
||||
}
|
||||
|
||||
// add word --> group
|
||||
ArrayList<Integer> groups = word2Groups.get(word);
|
||||
if (groups == null) {
|
||||
groups = new ArrayList<Integer>(1);
|
||||
word2Groups.put(word, groups);
|
||||
}
|
||||
groups.add(group);
|
||||
|
||||
// add group --> word
|
||||
ArrayList<String> words = group2Words.get(group);
|
||||
if (words == null) {
|
||||
words = new ArrayList<String>(1);
|
||||
group2Words.put(group, words);
|
||||
}
|
||||
words.add(word);
|
||||
}
|
||||
|
||||
|
||||
/* Part D: compute index data structure */
|
||||
HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);
|
||||
|
||||
/* Part E: minimize memory consumption by a factor 3 (or so) */
|
||||
// if (true) return word2Syns;
|
||||
word2Groups = null; // help gc
|
||||
//TODO: word2Groups.clear(); would be more appropriate ?
|
||||
group2Words = null; // help gc
|
||||
//TODO: group2Words.clear(); would be more appropriate ?
|
||||
|
||||
return optimize(word2Syns, internedWords);
|
||||
}
|
||||
|
||||
private HashMap<String,String[]> createIndex(Map<String,ArrayList<Integer>> word2Groups, Map<Integer,ArrayList<String>> group2Words) {
|
||||
HashMap<String,String[]> word2Syns = new HashMap<String,String[]>();
|
||||
|
||||
for (final Map.Entry<String,ArrayList<Integer>> entry : word2Groups.entrySet()) { // for each word
|
||||
ArrayList<Integer> group = entry.getValue();
|
||||
String word = entry.getKey();
|
||||
|
||||
// HashSet synonyms = new HashSet();
|
||||
TreeSet<String> synonyms = new TreeSet<String>();
|
||||
for (int i=group.size(); --i >= 0; ) { // for each groupID of word
|
||||
ArrayList<String> words = group2Words.get(group.get(i));
|
||||
for (int j=words.size(); --j >= 0; ) { // add all words
|
||||
String synonym = words.get(j); // note that w and word are interned
|
||||
if (synonym != word) { // a word is implicitly its own synonym
|
||||
synonyms.add(synonym);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int size = synonyms.size();
|
||||
if (size > 0) {
|
||||
String[] syns = new String[size];
|
||||
if (size == 1)
|
||||
syns[0] = synonyms.first();
|
||||
else
|
||||
synonyms.toArray(syns);
|
||||
// if (syns.length > 1) Arrays.sort(syns);
|
||||
// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
|
||||
word2Syns.put(word, syns);
|
||||
}
|
||||
}
|
||||
|
||||
return word2Syns;
|
||||
}
|
||||
|
||||
private HashMap<String,String[]> optimize(HashMap<String,String[]> word2Syns, HashMap<String,String> internedWords) {
|
||||
if (DEBUG) {
|
||||
System.err.println("before gc");
|
||||
for (int i=0; i < 10; i++) System.gc();
|
||||
System.err.println("after gc");
|
||||
}
|
||||
|
||||
// collect entries
|
||||
int len = 0;
|
||||
int size = word2Syns.size();
|
||||
String[][] allSynonyms = new String[size][];
|
||||
String[] words = new String[size];
|
||||
Iterator<Map.Entry<String,String[]>> iter = word2Syns.entrySet().iterator();
|
||||
for (int j=0; j < size; j++) {
|
||||
Map.Entry<String,String[]> entry = iter.next();
|
||||
allSynonyms[j] = entry.getValue();
|
||||
words[j] = entry.getKey();
|
||||
len += words[j].length();
|
||||
}
|
||||
|
||||
// assemble large string containing all words
|
||||
StringBuilder buf = new StringBuilder(len);
|
||||
for (int j=0; j < size; j++) buf.append(words[j]);
|
||||
String allWords = new String(buf.toString()); // ensure compact string across JDK versions
|
||||
buf = null;
|
||||
|
||||
// intern words at app level via memory-overlaid substrings
|
||||
for (int p=0, j=0; j < size; j++) {
|
||||
String word = words[j];
|
||||
internedWords.put(word, allWords.substring(p, p + word.length()));
|
||||
p += word.length();
|
||||
}
|
||||
|
||||
// replace words with interned words
|
||||
for (int j=0; j < size; j++) {
|
||||
String[] syns = allSynonyms[j];
|
||||
for (int k=syns.length; --k >= 0; ) {
|
||||
syns[k] = internedWords.get(syns[k]);
|
||||
}
|
||||
word2Syns.remove(words[j]);
|
||||
word2Syns.put(internedWords.get(words[j]), syns);
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
words = null;
|
||||
allSynonyms = null;
|
||||
internedWords = null;
|
||||
allWords = null;
|
||||
System.err.println("before gc");
|
||||
for (int i=0; i < 10; i++) System.gc();
|
||||
System.err.println("after gc");
|
||||
}
|
||||
return word2Syns;
|
||||
}
|
||||
|
||||
// the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
|
||||
private static byte[] toByteArray(InputStream input) throws IOException {
|
||||
try {
|
||||
// safe and fast even if input.available() behaves weird or buggy
|
||||
int len = Math.max(256, input.available());
|
||||
byte[] buffer = new byte[len];
|
||||
byte[] output = new byte[len];
|
||||
|
||||
len = 0;
|
||||
int n;
|
||||
while ((n = input.read(buffer)) >= 0) {
|
||||
if (len + n > output.length) { // grow capacity
|
||||
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
|
||||
System.arraycopy(output, 0, tmp, 0, len);
|
||||
System.arraycopy(buffer, 0, tmp, len, n);
|
||||
buffer = output; // use larger buffer for future larger bulk reads
|
||||
output = tmp;
|
||||
} else {
|
||||
System.arraycopy(buffer, 0, output, len, n);
|
||||
}
|
||||
len += n;
|
||||
}
|
||||
|
||||
if (len == output.length) return output;
|
||||
buffer = null; // help gc
|
||||
buffer = new byte[len];
|
||||
System.arraycopy(output, 0, buffer, 0, len);
|
||||
return buffer;
|
||||
} finally {
|
||||
input.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
Deleted: org/apache/lucene/wordnet/SynonymTokenFilter.java
@@ -1,148 +0,0 @@
|
|||
package org.apache.lucene.wordnet;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* Injects additional tokens for synonyms of token terms fetched from the
|
||||
* underlying child stream; the child stream must deliver lowercase tokens
|
||||
* for synonyms to be found.
|
||||
*
|
||||
*/
|
||||
public class SynonymTokenFilter extends TokenFilter {
|
||||
|
||||
/** The Token.type used to indicate a synonym to higher level filters. */
|
||||
public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
|
||||
|
||||
private final SynonymMap synonyms;
|
||||
private final int maxSynonyms;
|
||||
|
||||
private String[] stack = null;
|
||||
private int index = 0;
|
||||
private AttributeSource.State current = null;
|
||||
private int todo = 0;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
/**
|
||||
* Creates an instance for the given underlying stream and synonym table.
|
||||
*
|
||||
* @param input
|
||||
* the underlying child token stream
|
||||
* @param synonyms
|
||||
* the map used to extract synonyms for terms
|
||||
* @param maxSynonyms
|
||||
* the maximum number of synonym tokens to return per underlying
|
||||
* token word (a value of Integer.MAX_VALUE indicates unlimited)
|
||||
*/
|
||||
public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
|
||||
super(input);
|
||||
if (input == null)
|
||||
throw new IllegalArgumentException("input must not be null");
|
||||
if (synonyms == null)
|
||||
throw new IllegalArgumentException("synonyms must not be null");
|
||||
if (maxSynonyms < 0)
|
||||
throw new IllegalArgumentException("maxSynonyms must not be negative");
|
||||
|
||||
this.synonyms = synonyms;
|
||||
this.maxSynonyms = maxSynonyms;
|
||||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
while (todo > 0 && index < stack.length) { // pop from stack
|
||||
if (createToken(stack[index++], current)) {
|
||||
todo--;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!input.incrementToken()) return false; // EOS; iterator exhausted
|
||||
|
||||
stack = synonyms.getSynonyms(termAtt.toString()); // push onto stack
|
||||
if (stack.length > maxSynonyms) randomize(stack);
|
||||
index = 0;
|
||||
current = captureState();
|
||||
todo = maxSynonyms;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates and returns a token for the given synonym of the current input
|
||||
* token; Override for custom (stateless or stateful) behavior, if desired.
|
||||
*
|
||||
* @param synonym
|
||||
* a synonym for the current token's term
|
||||
* @param current
|
||||
* the current token from the underlying child stream
|
||||
* @return a new token, or null to indicate that the given synonym should be
|
||||
* ignored
|
||||
*/
|
||||
protected boolean createToken(String synonym, AttributeSource.State current) {
|
||||
restoreState(current);
|
||||
termAtt.setEmpty().append(synonym);
|
||||
typeAtt.setType(SYNONYM_TOKEN_TYPE);
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Randomize synonyms to later sample a subset. Uses constant random seed
|
||||
* for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
|
||||
* number generator with medium statistical quality (multiplicative
|
||||
* congruential method), producing integers in the range [Integer.MIN_VALUE,
|
||||
* Integer.MAX_VALUE].
|
||||
*/
|
||||
private static void randomize(Object[] arr) {
|
||||
int seed = 1234567; // constant
|
||||
int randomState = 4*seed + 1;
|
||||
// Random random = new Random(seed); // unnecessary overhead
|
||||
int len = arr.length;
|
||||
for (int i=0; i < len-1; i++) {
|
||||
randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
|
||||
int r = randomState % (len-i);
|
||||
if (r < 0) r = -r; // e.g. -9 % 2 == -1
|
||||
// int r = random.nextInt(len-i);
|
||||
|
||||
// swap arr[i, i+r]
|
||||
Object tmp = arr[i];
|
||||
arr[i] = arr[i + r];
|
||||
arr[i + r] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
stack = null;
|
||||
index = 0;
|
||||
current = null;
|
||||
todo = 0;
|
||||
}
|
||||
}
|
|
Deleted: org/apache/lucene/wordnet/Syns2Index.java
@@ -1,329 +0,0 @@
|
|||
package org.apache.lucene.wordnet;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.PrintStream;
|
||||
import java.io.Reader;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.TieredMergePolicy;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
|
||||
* into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
|
||||
*
|
||||
* This has been tested with WordNet 2.0.
|
||||
*
|
||||
* The index has fields named "word" ({@link #F_WORD})
|
||||
* and "syn" ({@link #F_SYN}).
|
||||
* <p>
|
||||
* The source word (such as 'big') can be looked up in the
|
||||
* "word" field, and if present there will be fields named "syn"
|
||||
* for every synonym. What's tricky here is that there could be <b>multiple</b>
|
||||
* fields with the same name, in the general case for words that have multiple synonyms.
|
||||
* That's not a problem with Lucene, you just use {@link org.apache.lucene.document.Document#getValues}
|
||||
* </p>
|
||||
* <p>
|
||||
* While the WordNet file distinguishes groups of synonyms with
|
||||
* related meanings we don't do that here.
|
||||
* </p>
|
||||
*
|
||||
* This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
|
||||
*
|
||||
* @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
|
||||
* @see <a href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb man page</a>
|
||||
* @see <a href="http://www.hostmon.com/rfc/advanced.jsp">sample site that uses it</a>
|
||||
*/
|
||||
public class Syns2Index
|
||||
{
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final PrintStream o = System.out;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final PrintStream err = System.err;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public static final String F_SYN = "syn";
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public static final String F_WORD = "word";
|
||||
|
||||
/**
|
||||
* we don't actually analyze any text (only a NOT_ANALYZED field),
|
||||
* but analyzer can't be null, docinverter wants the offset gap!
|
||||
*/
|
||||
private static final Analyzer ana = new Analyzer() {
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Takes arg of prolog file name and index directory.
|
||||
*/
|
||||
public static void main(String[] args)
|
||||
throws Throwable
|
||||
{
|
||||
// get command line arguments
|
||||
String prologFilename = null; // name of file "wn_s.pl"
|
||||
String indexDir = null;
|
||||
if (args.length == 2)
|
||||
{
|
||||
prologFilename = args[0];
|
||||
indexDir = args[1];
|
||||
}
|
||||
else
|
||||
{
|
||||
usage();
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
// ensure that the prolog file is readable
|
||||
if (! (new File(prologFilename)).canRead())
|
||||
{
|
||||
err.println("Error: cannot read Prolog file: " + prologFilename);
|
||||
System.exit(1);
|
||||
}
|
||||
// exit if the target index directory already exists
|
||||
if ((new File(indexDir)).isDirectory())
|
||||
{
|
||||
err.println("Error: index directory already exists: " + indexDir);
|
||||
err.println("Please specify a name of a non-existent directory");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
o.println("Opening Prolog file " + prologFilename);
|
||||
final FileInputStream fis = new FileInputStream(prologFilename);
|
||||
final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
|
||||
String line;
|
||||
|
||||
// maps a word to all the "groups" it's in
|
||||
final Map<String,List<String>> word2Nums = new TreeMap<String,List<String>>();
|
||||
// maps a group to all the words in it
|
||||
final Map<String,List<String>> num2Words = new TreeMap<String,List<String>>();
|
||||
// number of rejected words
|
||||
int ndecent = 0;
|
||||
|
||||
// status output
|
||||
int mod = 1;
|
||||
int row = 1;
|
||||
// parse prolog file
|
||||
o.println( "[1/2] Parsing " + prologFilename);
|
||||
while ((line = br.readLine()) != null)
|
||||
{
|
||||
// occasional progress
|
||||
if ((++row) % mod == 0) // periodically print out line we read in
|
||||
{
|
||||
mod *= 2;
|
||||
o.println("\t" + row + " " + line + " " + word2Nums.size()
|
||||
+ " " + num2Words.size() + " ndecent=" + ndecent);
|
||||
}
|
||||
|
||||
// syntax check
|
||||
if (! line.startsWith("s("))
|
||||
{
|
||||
err.println("OUCH: " + line);
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
// parse line
|
||||
line = line.substring(2);
|
||||
int comma = line.indexOf(',');
|
||||
String num = line.substring(0, comma);
|
||||
int q1 = line.indexOf('\'');
|
||||
line = line.substring(q1 + 1);
|
||||
int q2 = line.lastIndexOf('\'');
|
||||
String word = line.substring(0, q2).toLowerCase().replace("''", "'");
|
||||
|
||||
// make sure is a normal word
|
||||
if (! isDecent(word))
|
||||
{
|
||||
ndecent++;
|
||||
continue; // don't store words w/ spaces
|
||||
}
|
||||
|
||||
// 1/2: word2Nums map
|
||||
// append to entry or add new one
|
||||
List<String> lis = word2Nums.get(word);
|
||||
if (lis == null)
|
||||
{
|
||||
lis = new LinkedList<String>();
|
||||
lis.add(num);
|
||||
word2Nums.put(word, lis);
|
||||
}
|
||||
else
|
||||
lis.add(num);
|
||||
|
||||
// 2/2: num2Words map
|
||||
lis = num2Words.get(num);
|
||||
if (lis == null)
|
||||
{
|
||||
lis = new LinkedList<String>();
|
||||
lis.add(word);
|
||||
num2Words.put(num, lis);
|
||||
}
|
||||
else
|
||||
lis.add(word);
|
||||
}
|
||||
|
||||
// close the streams
|
||||
fis.close();
|
||||
br.close();
|
||||
|
||||
// create the index
|
||||
o.println( "[2/2] Building index to store synonyms, " +
|
||||
" map sizes are " + word2Nums.size() + " and " + num2Words.size());
|
||||
index(indexDir, word2Nums, num2Words);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks to see if a word contains only alphabetic characters by
|
||||
* checking it one character at a time.
|
||||
*
|
||||
* @param s string to check
|
||||
* @return <code>true</code> if the string is decent
|
||||
*/
|
||||
private static boolean isDecent(String s)
|
||||
{
|
||||
int len = s.length();
|
||||
for (int i = 0; i < len; i++)
|
||||
{
|
||||
if (!Character.isLetter(s.charAt(i)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Forms a Lucene index based on the 2 maps.
|
||||
*
|
||||
* @param indexDir the directory where the index should be created
|
||||
* @param word2Nums
|
||||
* @param num2Words
|
||||
*/
|
||||
private static void index(String indexDir, Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words)
|
||||
throws Throwable
|
||||
{
|
||||
int row = 0;
|
||||
int mod = 1;
|
||||
FSDirectory dir = FSDirectory.open(new File(indexDir));
|
||||
try {
|
||||
|
||||
// override the specific index if it already exists
|
||||
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
|
||||
Version.LUCENE_CURRENT, ana).setOpenMode(OpenMode.CREATE));
|
||||
((TieredMergePolicy) writer.getConfig().getMergePolicy()).setUseCompoundFile(true); // why?
|
||||
Iterator<String> i1 = word2Nums.keySet().iterator();
|
||||
while (i1.hasNext()) // for each word
|
||||
{
|
||||
String g = i1.next();
|
||||
Document doc = new Document();
|
||||
|
||||
int n = index(word2Nums, num2Words, g, doc);
|
||||
if (n > 0)
|
||||
{
|
||||
doc.add( new Field( F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
if ((++row % mod) == 0)
|
||||
{
|
||||
o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc);
|
||||
mod *= 2;
|
||||
}
|
||||
writer.addDocument(doc);
|
||||
} // else degenerate
|
||||
}
|
||||
o.println( "Optimizing..");
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
} finally {
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the 2 maps fills a document for 1 word.
|
||||
*/
|
||||
private static int index(Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words, String g, Document doc)
|
||||
throws Throwable
|
||||
{
|
||||
List<String> keys = word2Nums.get(g); // get list of key#'s
|
||||
Iterator<String> i2 = keys.iterator();
|
||||
|
||||
Set<String> already = new TreeSet<String>(); // keep them sorted
|
||||
|
||||
// pass 1: fill up 'already' with all words
|
||||
while (i2.hasNext()) // for each key#
|
||||
{
|
||||
already.addAll(num2Words.get(i2.next())); // get list of words
|
||||
}
|
||||
int num = 0;
|
||||
already.remove(g); // of course a word is its own syn
|
||||
Iterator<String> it = already.iterator();
|
||||
while (it.hasNext())
|
||||
{
|
||||
String cur = it.next();
|
||||
// don't store things like 'pit bull' -> 'american pit bull'
|
||||
if (!isDecent(cur))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
num++;
|
||||
doc.add( new Field( F_SYN, cur, Field.Store.YES, Field.Index.NO));
|
||||
}
|
||||
return num;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static void usage()
|
||||
{
|
||||
o.println("\n\n" +
|
||||
"java org.apache.lucene.wordnet.Syns2Index <prolog file> <index dir>\n\n");
|
||||
}
|
||||
|
||||
}
|
|
Deleted: org/apache/lucene/wordnet/package.html
@@ -1,57 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
   <title>WordNet Lucene Synonyms Integration</title>
</head>
<body>

This package uses synonyms defined by <a href="http://www.cogsci.princeton.edu/~wn/">WordNet</a>.
There are two methods: query expansion and analysis.

Both methods first require you to download the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog database</a>.
Inside this archive is a file named wn_s.pl, which contains the WordNet synonyms.

<h1>Query Expansion Method</h1>
This method creates a Lucene index storing the synonyms, which in turn can be used for query expansion.

You normally run {@link org.apache.lucene.wordnet.Syns2Index} once to build the query index/"database", and then call
{@link org.apache.lucene.wordnet.SynExpand#expand SynExpand.expand(...)} to expand a query.

<p>

<h3> Instructions </h3>
<ol>
 <li> Invoke Syns2Index as appropriate to build a synonym index.
      It takes two arguments: the path to wn_s.pl from the WordNet download, and the index name.

 <li> Update your UI so that, as appropriate, you call SynExpand.expand(...) to expand user queries with synonyms.
</ol>

<h1>Analysis Method</h1>
This method injects additional synonym tokens for tokens from a child {@link org.apache.lucene.analysis.TokenStream}.
A sketch of this method follows the page source below.

<h3> Instructions </h3>
<ol>
 <li>Create a {@link org.apache.lucene.wordnet.SynonymMap}, passing in the path to wn_s.pl.
 <li>Add a {@link org.apache.lucene.wordnet.SynonymTokenFilter} to your analyzer.  Note: SynonymTokenFilter should be after LowerCaseFilter,
     because it expects terms to already be in lowercase.
</ol>

</body>
</html>
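A minimal sketch of the Analysis Method described in the deleted package.html above, using only the two classes removed by this commit (their constructors appear earlier in this diff). The wn_s.pl path and the choice of at most 3 synonyms per token are illustrative assumptions.

import java.io.FileInputStream;
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.wordnet.SynonymMap;
import org.apache.lucene.wordnet.SynonymTokenFilter;

// Sketch: load wn_s.pl once (loading is the expensive step, per the SynonymMap
// javadoc), then wrap an already-lowercased TokenStream so synonym tokens are
// injected at the same position as the original term.
public final class SynonymAnalysisSketch {

  public static SynonymMap loadMap(String wnSplPath) throws IOException {
    // SynonymMap reads the stream and closes it itself
    return new SynonymMap(new FileInputStream(wnSplPath));
  }

  public static TokenStream withSynonyms(TokenStream lowercasedTokens, SynonymMap map) {
    // at most 3 synonyms per input token; pass Integer.MAX_VALUE for no limit
    return new SynonymTokenFilter(lowercasedTokens, map, 3);
  }
}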
|
Deleted: org/apache/lucene/wordnet/TestSynonymTokenFilter.java
@@ -1,119 +0,0 @@
|
|||
package org.apache.lucene.wordnet;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
||||
public class TestSynonymTokenFilter extends BaseTokenStreamTestCase {
|
||||
final String testFile = "testSynonyms.txt";
|
||||
|
||||
public void testSynonyms() throws Exception {
|
||||
SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
|
||||
/* all expansions */
|
||||
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE);
|
||||
assertAnalyzesTo(analyzer, "Lost in the woods",
|
||||
new String[] { "lost", "in", "the", "woods", "forest", "wood" },
|
||||
new int[] { 0, 5, 8, 12, 12, 12 },
|
||||
new int[] { 4, 7, 11, 17, 17, 17 },
|
||||
new int[] { 1, 1, 1, 1, 0, 0 });
|
||||
}
|
||||
|
||||
public void testSynonymsSingleQuote() throws Exception {
|
||||
SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
|
||||
/* all expansions */
|
||||
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE);
|
||||
assertAnalyzesTo(analyzer, "king",
|
||||
new String[] { "king", "baron" });
|
||||
}
|
||||
|
||||
public void testSynonymsLimitedAmount() throws Exception {
|
||||
SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
|
||||
/* limit to one synonym expansion */
|
||||
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
|
||||
assertAnalyzesTo(analyzer, "Lost in the woods",
|
||||
/* wood comes before forest due to
|
||||
* the input file, not lexicographic order
|
||||
*/
|
||||
new String[] { "lost", "in", "the", "woods", "wood" },
|
||||
new int[] { 0, 5, 8, 12, 12 },
|
||||
new int[] { 4, 7, 11, 17, 17 },
|
||||
new int[] { 1, 1, 1, 1, 0 });
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
|
||||
/* limit to one synonym expansion */
|
||||
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
|
||||
assertAnalyzesToReuse(analyzer, "Lost in the woods",
|
||||
new String[] { "lost", "in", "the", "woods", "wood" },
|
||||
new int[] { 0, 5, 8, 12, 12 },
|
||||
new int[] { 4, 7, 11, 17, 17 },
|
||||
new int[] { 1, 1, 1, 1, 0 });
|
||||
assertAnalyzesToReuse(analyzer, "My wolfish dog went to the forest",
|
||||
new String[] { "my", "wolfish", "ravenous", "dog", "went", "to",
|
||||
"the", "forest", "woods" },
|
||||
new int[] { 0, 3, 3, 11, 15, 20, 23, 27, 27 },
|
||||
new int[] { 2, 10, 10, 14, 19, 22, 26, 33, 33 },
|
||||
new int[] { 1, 1, 0, 1, 1, 1, 1, 1, 0 });
|
||||
}
|
||||
|
||||
private class SynonymWhitespaceAnalyzer extends Analyzer {
|
||||
private SynonymMap synonyms;
|
||||
private int maxSynonyms;
|
||||
|
||||
public SynonymWhitespaceAnalyzer(SynonymMap synonyms, int maxSynonyms) {
|
||||
this.synonyms = synonyms;
|
||||
this.maxSynonyms = maxSynonyms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream ts = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
|
||||
ts = new SynonymTokenFilter(ts, synonyms, maxSynonyms);
|
||||
return ts;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
|
||||
streams.result = new SynonymTokenFilter(streams.source, synonyms, maxSynonyms);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
Deleted: org/apache/lucene/wordnet/TestWordnet.java
@@ -1,94 +0,0 @@
|
|||
package org.apache.lucene.wordnet;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class TestWordnet extends LuceneTestCase {
|
||||
private IndexSearcher searcher;
|
||||
private Directory dir;
|
||||
|
||||
String storePathName = new File(TEMP_DIR,"testLuceneWordnet").getAbsolutePath();
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
// create a temporary synonym index
|
||||
File testFile = getDataFile("testSynonyms.txt");
|
||||
String commandLineArgs[] = { testFile.getAbsolutePath(), storePathName };
|
||||
_TestUtil.rmDir(new File(storePathName));
|
||||
|
||||
try {
|
||||
Syns2Index.main(commandLineArgs);
|
||||
} catch (Throwable t) { throw new RuntimeException(t); }
|
||||
|
||||
dir = newFSDirectory(new File(storePathName));
|
||||
searcher = new IndexSearcher(dir, true);
|
||||
}
|
||||
|
||||
public void testExpansion() throws IOException {
|
||||
assertExpandsTo("woods", new String[] { "woods", "forest", "wood" });
|
||||
}
|
||||
|
||||
public void testExpansionSingleQuote() throws IOException {
|
||||
assertExpandsTo("king", new String[] { "king", "baron" });
|
||||
}
|
||||
|
||||
private void assertExpandsTo(String term, String expected[]) throws IOException {
|
||||
Query expandedQuery = SynExpand.expand(term, searcher, new
|
||||
MockAnalyzer(random), "field", 1F);
|
||||
BooleanQuery expectedQuery = new BooleanQuery();
|
||||
for (String t : expected)
|
||||
expectedQuery.add(new TermQuery(new Term("field", t)),
|
||||
BooleanClause.Occur.SHOULD);
|
||||
assertEquals(expectedQuery, expandedQuery);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
if (searcher != null) {
|
||||
searcher.close();
|
||||
}
|
||||
if (dir != null) {
|
||||
dir.close();
|
||||
}
|
||||
rmDir(storePathName); // delete our temporary synonym index
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
private void rmDir(String directory) {
|
||||
File dir = new File(directory);
|
||||
File[] files = dir.listFiles();
|
||||
for (int i = 0; i < files.length; i++) {
|
||||
files[i].delete();
|
||||
}
|
||||
dir.delete();
|
||||
}
|
||||
}
|
|
@ -1,9 +0,0 @@
|
|||
s(100000001,1,'woods',n,1,0).
|
||||
s(100000001,2,'wood',n,1,0).
|
||||
s(100000001,3,'forest',n,1,0).
|
||||
s(100000002,1,'wolfish',n,1,0).
|
||||
s(100000002,2,'ravenous',n,1,0).
|
||||
s(100000003,1,'king',n,1,1).
|
||||
s(100000003,2,'baron',n,1,1).
|
||||
s(100000004,1,'king''sevil',n,1,1).
|
||||
s(100000004,2,'meany',n,1,1).
|
|
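For reference, each fact in the removed test data above follows the WordNet Prolog wn_s.pl layout, s(synset_id, w_num, 'word', ss_type, sense_number, tag_count), where a doubled single quote escapes a quote inside the word (as in 'king''sevil'). A minimal, hypothetical sketch of pulling the synset id and word out of one such line; the class name and regex are illustrative only and not part of this commit:

// Hedged sketch, not from this commit: parse one wn_s.pl fact such as
//   s(100000001,1,'woods',n,1,0).
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class PrologSynLine {
  // group(1) = synset id, group(2) = word (quotes still doubled)
  private static final Pattern S_LINE =
      Pattern.compile("s\\((\\d+),\\d+,'(.*)',[^,]+,\\d+,\\d+\\)\\.");

  static String[] parse(String line) {
    Matcher m = S_LINE.matcher(line);
    if (!m.matches()) {
      return null; // not an s(...) fact
    }
    return new String[] { m.group(1), m.group(2).replace("''", "'") };
  }
}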
@ -95,9 +95,6 @@ public class MemoryCodec extends Codec {
|
|||
this.out = out;
|
||||
this.field = field;
|
||||
builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
|
||||
// The byte[] output we create can easily be > 255 bytes:
|
||||
builder.setAllowArrayArcs(false);
|
||||
}
|
||||
|
||||
private class PostingsWriter extends PostingsConsumer {
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
package org.apache.lucene.store;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class ByteArrayDataOutput extends DataOutput {
|
||||
private byte[] bytes;
|
||||
|
||||
private int pos;
|
||||
private int limit;
|
||||
|
||||
public ByteArrayDataOutput(byte[] bytes) {
|
||||
reset(bytes);
|
||||
}
|
||||
|
||||
public ByteArrayDataOutput(byte[] bytes, int offset, int len) {
|
||||
reset(bytes, offset, len);
|
||||
}
|
||||
|
||||
public ByteArrayDataOutput() {
|
||||
reset(BytesRef.EMPTY_BYTES);
|
||||
}
|
||||
|
||||
public void reset(byte[] bytes) {
|
||||
reset(bytes, 0, bytes.length);
|
||||
}
|
||||
|
||||
public void reset(byte[] bytes, int offset, int len) {
|
||||
this.bytes = bytes;
|
||||
pos = offset;
|
||||
limit = offset + len;
|
||||
}
|
||||
|
||||
public int getPosition() {
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeByte(byte b) {
|
||||
assert pos < limit;
|
||||
bytes[pos++] = b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeBytes(byte[] b, int offset, int length) {
|
||||
assert pos + length <= limit;
|
||||
System.arraycopy(b, offset, bytes, pos, length);
|
||||
pos += length;
|
||||
}
|
||||
}
|
|
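A minimal usage sketch for the new ByteArrayDataOutput above; everything except ByteArrayDataOutput itself (the wrapper class, method, and values) is hypothetical and not part of this commit:

import java.io.IOException;

import org.apache.lucene.store.ByteArrayDataOutput;

class ByteArrayDataOutputExample {
  // Fills the caller-supplied buffer and reports how many bytes were used.
  static int fill(byte[] scratch) throws IOException {
    ByteArrayDataOutput out = new ByteArrayDataOutput(scratch); // writes start at offset 0
    out.writeVInt(1234);                        // writeVInt is inherited from DataOutput
    out.writeBytes(new byte[] {1, 2, 3}, 0, 3); // bounds are only checked by asserts
    return out.getPosition();
  }
}

The same instance can be re-pointed at another region with reset(bytes, offset, len) instead of allocating a new one.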
@ -1,5 +1,7 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -167,7 +169,11 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence {
|
|||
* the {@link CharsRef} to copy
|
||||
*/
|
||||
public void copy(CharsRef other) {
|
||||
if (chars == null) {
|
||||
chars = new char[other.length];
|
||||
} else {
|
||||
chars = ArrayUtil.grow(chars, other.length);
|
||||
}
|
||||
System.arraycopy(other.chars, other.offset, chars, 0, other.length);
|
||||
length = other.length;
|
||||
offset = 0;
|
||||
|
@ -213,4 +219,56 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence {
|
|||
public CharSequence subSequence(int start, int end) {
|
||||
return new CharsRef(chars, offset + start, offset + end - 1);
|
||||
}
|
||||
|
||||
private final static Comparator<CharsRef> utf16SortedAsUTF8SortOrder = new UTF16SortedAsUTF8Comparator();
|
||||
|
||||
public static Comparator<CharsRef> getUTF16SortedAsUTF8Comparator() {
|
||||
return utf16SortedAsUTF8SortOrder;
|
||||
}
|
||||
|
||||
private static class UTF16SortedAsUTF8Comparator implements Comparator<CharsRef> {
|
||||
// Only singleton
|
||||
private UTF16SortedAsUTF8Comparator() {};
|
||||
|
||||
public int compare(CharsRef a, CharsRef b) {
|
||||
if (a == b)
|
||||
return 0;
|
||||
|
||||
final char[] aChars = a.chars;
|
||||
int aUpto = a.offset;
|
||||
final char[] bChars = b.chars;
|
||||
int bUpto = b.offset;
|
||||
|
||||
final int aStop = aUpto + Math.min(a.length, b.length);
|
||||
|
||||
while (aUpto < aStop) {
|
||||
char aChar = aChars[aUpto++];
|
||||
char bChar = bChars[bUpto++];
|
||||
if (aChar != bChar) {
|
||||
// http://icu-project.org/docs/papers/utf16_code_point_order.html
|
||||
|
||||
/* aChar != bChar, fix up each one if they're both in or above the surrogate range, then compare them */
|
||||
if (aChar >= 0xd800 && bChar >= 0xd800) {
|
||||
if (aChar >= 0xe000) {
|
||||
aChar -= 0x800;
|
||||
} else {
|
||||
aChar += 0x2000;
|
||||
}
|
||||
|
||||
if (bChar >= 0xe000) {
|
||||
bChar -= 0x800;
|
||||
} else {
|
||||
bChar += 0x2000;
|
||||
}
|
||||
}
|
||||
|
||||
/* now aChar and bChar are in code point order */
|
||||
return (int)aChar - (int)bChar; /* int must be 32 bits wide */
|
||||
}
|
||||
}
|
||||
|
||||
// One is a prefix of the other, or, they are equal:
|
||||
return a.length - b.length;
|
||||
}
|
||||
}
|
||||
}
|
|
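The adjustment inside the comparator above is the standard ICU trick: once both code units are at or above 0xD800, non-surrogate BMP chars (0xE000-0xFFFF) are shifted down by 0x800 and surrogates (0xD800-0xDFFF) up by 0x2000, so surrogates, and therefore supplementary code points, compare greater than every other BMP character, which is exactly UTF-8/code point order. A hedged stand-alone sketch of just that mapping (illustrative only, not part of this commit):

// Hypothetical helper: maps a UTF-16 code unit >= 0xD800 into a value whose
// unsigned order equals code point (and therefore UTF-8 byte) order.
final class Utf16OrderFixup {
  static char fixup(char c) {
    assert c >= 0xd800;
    if (c >= 0xe000) {
      return (char) (c - 0x800);  // 0xE000..0xFFFF -> 0xD800..0xF7FF
    } else {
      return (char) (c + 0x2000); // surrogates 0xD800..0xDFFF -> 0xF800..0xFFFF
    }
  }
}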
@ -71,7 +71,11 @@ public class FST<T> {
|
|||
// Increment version to change it
|
||||
private final static String FILE_FORMAT_NAME = "FST";
|
||||
private final static int VERSION_START = 0;
|
||||
private final static int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
/** Changed numBytesPerArc for array'd case from byte to int. */
|
||||
private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1;
|
||||
|
||||
private final static int VERSION_CURRENT = VERSION_INT_NUM_BYTES_PER_ARC;
|
||||
|
||||
// Never serialized; just used to represent the virtual
|
||||
// final node w/ no arcs:
|
||||
|
@ -106,6 +110,8 @@ public class FST<T> {
|
|||
|
||||
private boolean allowArrayArcs = true;
|
||||
|
||||
private Arc<T> cachedRootArcs[];
|
||||
|
||||
public final static class Arc<T> {
|
||||
public int label;
|
||||
public T output;
|
||||
|
@ -113,7 +119,7 @@ public class FST<T> {
|
|||
int target;
|
||||
|
||||
byte flags;
|
||||
T nextFinalOutput;
|
||||
public T nextFinalOutput;
|
||||
int nextArc;
|
||||
|
||||
// This is non-zero if current arcs are fixed array:
|
||||
|
@ -176,7 +182,7 @@ public class FST<T> {
|
|||
public FST(DataInput in, Outputs<T> outputs) throws IOException {
|
||||
this.outputs = outputs;
|
||||
writer = null;
|
||||
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_START);
|
||||
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_INT_NUM_BYTES_PER_ARC, VERSION_INT_NUM_BYTES_PER_ARC);
|
||||
if (in.readByte() == 1) {
|
||||
// accepts empty string
|
||||
int numBytes = in.readVInt();
|
||||
|
@ -209,6 +215,8 @@ public class FST<T> {
|
|||
bytes = new byte[in.readVInt()];
|
||||
in.readBytes(bytes, 0, bytes.length);
|
||||
NO_OUTPUT = outputs.getNoOutput();
|
||||
|
||||
cacheRootArcs();
|
||||
}
|
||||
|
||||
public INPUT_TYPE getInputType() {
|
||||
|
@ -220,7 +228,7 @@ public class FST<T> {
|
|||
return bytes.length;
|
||||
}
|
||||
|
||||
void finish(int startNode) {
|
||||
void finish(int startNode) throws IOException {
|
||||
if (startNode == FINAL_END_NODE && emptyOutput != null) {
|
||||
startNode = 0;
|
||||
}
|
||||
|
@ -231,6 +239,32 @@ public class FST<T> {
|
|||
System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite);
|
||||
bytes = finalBytes;
|
||||
this.startNode = startNode;
|
||||
|
||||
cacheRootArcs();
|
||||
}
|
||||
|
||||
// Caches first 128 labels
|
||||
@SuppressWarnings("unchecked")
|
||||
private void cacheRootArcs() throws IOException {
|
||||
cachedRootArcs = (FST.Arc<T>[]) new FST.Arc[0x80];
|
||||
final FST.Arc<T> arc = new FST.Arc<T>();
|
||||
getFirstArc(arc);
|
||||
final BytesReader in = getBytesReader(0);
|
||||
if (targetHasArcs(arc)) {
|
||||
readFirstRealArc(arc.target, arc);
|
||||
while(true) {
|
||||
assert arc.label != END_LABEL;
|
||||
if (arc.label < cachedRootArcs.length) {
|
||||
cachedRootArcs[arc.label] = new Arc<T>().copyFrom(arc);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
if (arc.isLast()) {
|
||||
break;
|
||||
}
|
||||
readNextRealArc(arc, in);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void setEmptyOutput(T v) throws IOException {
|
||||
|
@ -345,8 +379,9 @@ public class FST<T> {
|
|||
writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY);
|
||||
writer.writeVInt(node.numArcs);
|
||||
// placeholder -- we'll come back and write the number
|
||||
// of bytes per arc here:
|
||||
writer.writeByte((byte) 0);
|
||||
// of bytes per arc (int) here:
|
||||
// TODO: we could make this a vInt instead
|
||||
writer.writeInt(0);
|
||||
fixedArrayStart = writer.posWrite;
|
||||
//System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart);
|
||||
} else {
|
||||
|
@ -421,15 +456,21 @@ public class FST<T> {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: if arc'd arrays will be "too wasteful" by some
|
||||
// measure, eg if arcs have vastly different sized
|
||||
// outputs, then we should selectively disable array for
|
||||
// such cases
|
||||
|
||||
if (doFixedArray) {
|
||||
assert maxBytesPerArc > 0;
|
||||
// 2nd pass just "expands" all arcs to take up a fixed
|
||||
// byte size
|
||||
final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc;
|
||||
bytes = ArrayUtil.grow(bytes, sizeNeeded);
|
||||
if (maxBytesPerArc > 255) {
|
||||
throw new IllegalStateException("max arc size is too large (" + maxBytesPerArc + "); disable array arcs by calling Builder.setAllowArrayArcs(false)");
|
||||
}
|
||||
// TODO: we could make this a vInt instead
|
||||
bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24);
|
||||
bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16);
|
||||
bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8);
|
||||
bytes[fixedArrayStart-1] = (byte) maxBytesPerArc;
|
||||
|
||||
// expand the arcs in place, backwards
|
||||
|
@ -502,7 +543,7 @@ public class FST<T> {
|
|||
if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
|
||||
// array: jump straight to end
|
||||
arc.numArcs = in.readVInt();
|
||||
arc.bytesPerArc = in.readByte() & 0xFF;
|
||||
arc.bytesPerArc = in.readInt();
|
||||
//System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
|
||||
arc.posArcsStart = in.pos;
|
||||
arc.arcIdx = arc.numArcs - 2;
|
||||
|
@ -528,7 +569,7 @@ public class FST<T> {
|
|||
}
|
||||
arc.nextArc = in.pos+1;
|
||||
}
|
||||
readNextRealArc(arc);
|
||||
readNextRealArc(arc, in);
|
||||
assert arc.isLast();
|
||||
return arc;
|
||||
}
|
||||
|
@ -572,7 +613,7 @@ public class FST<T> {
|
|||
//System.out.println(" fixedArray");
|
||||
// this is first arc in a fixed-array
|
||||
arc.numArcs = in.readVInt();
|
||||
arc.bytesPerArc = in.readByte() & 0xFF;
|
||||
arc.bytesPerArc = in.readInt();
|
||||
arc.arcIdx = -1;
|
||||
arc.nextArc = arc.posArcsStart = in.pos;
|
||||
//System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
|
||||
|
@ -580,7 +621,7 @@ public class FST<T> {
|
|||
arc.nextArc = address;
|
||||
arc.bytesPerArc = 0;
|
||||
}
|
||||
return readNextRealArc(arc);
|
||||
return readNextRealArc(arc, in);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -609,7 +650,7 @@ public class FST<T> {
|
|||
}
|
||||
return readFirstRealArc(arc.nextArc, arc);
|
||||
} else {
|
||||
return readNextRealArc(arc);
|
||||
return readNextRealArc(arc, getBytesReader(0));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -627,7 +668,7 @@ public class FST<T> {
|
|||
//System.out.println(" nextArc fake array");
|
||||
in.pos--;
|
||||
in.readVInt();
|
||||
in.readByte();
|
||||
in.readInt();
|
||||
}
|
||||
} else {
|
||||
if (arc.bytesPerArc != 0) {
|
||||
|
@ -645,17 +686,16 @@ public class FST<T> {
|
|||
return readLabel(in);
|
||||
}
|
||||
|
||||
Arc<T> readNextRealArc(Arc<T> arc) throws IOException {
|
||||
Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
|
||||
// this is a continuing arc in a fixed array
|
||||
final BytesReader in;
|
||||
if (arc.bytesPerArc != 0) {
|
||||
// arcs are at fixed entries
|
||||
arc.arcIdx++;
|
||||
assert arc.arcIdx < arc.numArcs;
|
||||
in = getBytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc);
|
||||
in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc;
|
||||
} else {
|
||||
// arcs are packed
|
||||
in = getBytesReader(arc.nextArc);
|
||||
in.pos = arc.nextArc;
|
||||
}
|
||||
arc.flags = in.readByte();
|
||||
arc.label = readLabel(in);
|
||||
|
@ -701,6 +741,17 @@ public class FST<T> {
|
|||
/** Finds an arc leaving the incoming arc, replacing the arc in place.
|
||||
* This returns null if the arc was not found, else the incoming arc. */
|
||||
public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc) throws IOException {
|
||||
assert cachedRootArcs != null;
|
||||
// Short-circuit if this arc is in the root arc cache:
|
||||
if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) {
|
||||
final Arc<T> result = cachedRootArcs[labelToMatch];
|
||||
if (result == null) {
|
||||
return result;
|
||||
} else {
|
||||
arc.copyFrom(result);
|
||||
return arc;
|
||||
}
|
||||
}
|
||||
|
||||
if (labelToMatch == END_LABEL) {
|
||||
if (follow.isFinal()) {
|
||||
|
@ -726,14 +777,18 @@ public class FST<T> {
|
|||
// reusable stuff eg BytesReader:
|
||||
final BytesReader in = getBytesReader(follow.target);
|
||||
|
||||
// System.out.println("fta label=" + (char) labelToMatch);
|
||||
|
||||
if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) {
|
||||
// Arcs are full array; do binary search:
|
||||
arc.numArcs = in.readVInt();
|
||||
arc.bytesPerArc = in.readByte() & 0xFF;
|
||||
//System.out.println(" bs " + arc.numArcs);
|
||||
arc.bytesPerArc = in.readInt();
|
||||
arc.posArcsStart = in.pos;
|
||||
int low = 0;
|
||||
int high = arc.numArcs-1;
|
||||
while (low <= high) {
|
||||
//System.out.println(" cycle");
|
||||
int mid = (low + high) >>> 1;
|
||||
in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
|
||||
int midLabel = readLabel(in);
|
||||
|
@ -744,7 +799,8 @@ public class FST<T> {
|
|||
high = mid - 1;
|
||||
else {
|
||||
arc.arcIdx = mid-1;
|
||||
return readNextRealArc(arc);
|
||||
//System.out.println(" found!");
|
||||
return readNextRealArc(arc, in);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -754,7 +810,12 @@ public class FST<T> {
|
|||
// Linear scan
|
||||
readFirstTargetArc(follow, arc);
|
||||
while(true) {
|
||||
//System.out.println(" non-bs cycle");
|
||||
// TODO: we should fix this code to not have to create
|
||||
// object for the output of every arc we scan... only
|
||||
// for the matching arc, if found
|
||||
if (arc.label == labelToMatch) {
|
||||
//System.out.println(" found!");
|
||||
return arc;
|
||||
} else if (arc.label > labelToMatch) {
|
||||
return null;
|
||||
|
@ -863,7 +924,7 @@ public class FST<T> {
|
|||
}
|
||||
|
||||
// Non-static: reads byte[] from FST
|
||||
class BytesReader extends DataInput {
|
||||
final class BytesReader extends DataInput {
|
||||
int pos;
|
||||
|
||||
public BytesReader(int pos) {
|
||||
|
|
|
@ -170,7 +170,7 @@ abstract class FSTEnum<T> {
|
|||
if (found) {
|
||||
// Match
|
||||
arc.arcIdx = mid-1;
|
||||
fst.readNextRealArc(arc);
|
||||
fst.readNextRealArc(arc, in);
|
||||
assert arc.arcIdx == mid;
|
||||
assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
|
||||
output[upto] = fst.outputs.add(output[upto-1], arc.output);
|
||||
|
@ -185,7 +185,7 @@ abstract class FSTEnum<T> {
|
|||
} else if (low == arc.numArcs) {
|
||||
// Dead end
|
||||
arc.arcIdx = arc.numArcs-2;
|
||||
fst.readNextRealArc(arc);
|
||||
fst.readNextRealArc(arc, in);
|
||||
assert arc.isLast();
|
||||
// Dead end (target is after the last arc);
|
||||
// rollback to last fork then push
|
||||
|
@ -205,7 +205,7 @@ abstract class FSTEnum<T> {
|
|||
}
|
||||
} else {
|
||||
arc.arcIdx = (low > high ? low : high)-1;
|
||||
fst.readNextRealArc(arc);
|
||||
fst.readNextRealArc(arc, in);
|
||||
assert arc.label > targetLabel;
|
||||
pushFirst();
|
||||
return;
|
||||
|
@ -309,7 +309,7 @@ abstract class FSTEnum<T> {
|
|||
// Match -- recurse
|
||||
//System.out.println(" match! arcIdx=" + mid);
|
||||
arc.arcIdx = mid-1;
|
||||
fst.readNextRealArc(arc);
|
||||
fst.readNextRealArc(arc, in);
|
||||
assert arc.arcIdx == mid;
|
||||
assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
|
||||
output[upto] = fst.outputs.add(output[upto-1], arc.output);
|
||||
|
@ -352,7 +352,7 @@ abstract class FSTEnum<T> {
|
|||
// There is a floor arc:
|
||||
arc.arcIdx = (low > high ? high : low)-1;
|
||||
//System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1));
|
||||
fst.readNextRealArc(arc);
|
||||
fst.readNextRealArc(arc, in);
|
||||
assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel;
|
||||
assert arc.label < targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel;
|
||||
pushLast();
|
||||
|
|
|
@ -35,6 +35,7 @@ final class NodeHash<T> {
|
|||
}
|
||||
|
||||
private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address) throws IOException {
|
||||
final FST<T>.BytesReader in = fst.getBytesReader(0);
|
||||
fst.readFirstRealArc(address, scratchArc);
|
||||
if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
|
||||
return false;
|
||||
|
@ -56,7 +57,7 @@ final class NodeHash<T> {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
fst.readNextRealArc(scratchArc);
|
||||
fst.readNextRealArc(scratchArc, in);
|
||||
}
|
||||
|
||||
return false;
|
||||
|
@ -87,6 +88,7 @@ final class NodeHash<T> {
|
|||
// hash code for a frozen node
|
||||
private int hash(int node) throws IOException {
|
||||
final int PRIME = 31;
|
||||
final FST<T>.BytesReader in = fst.getBytesReader(0);
|
||||
//System.out.println("hash frozen");
|
||||
int h = 0;
|
||||
fst.readFirstRealArc(node, scratchArc);
|
||||
|
@ -102,7 +104,7 @@ final class NodeHash<T> {
|
|||
if (scratchArc.isLast()) {
|
||||
break;
|
||||
}
|
||||
fst.readNextRealArc(scratchArc);
|
||||
fst.readNextRealArc(scratchArc, in);
|
||||
}
|
||||
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
|
||||
return h & Integer.MAX_VALUE;
|
||||
|
|
|
@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -359,12 +356,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
</ul>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a> ___________________ <em>javadoc-contrib-wordnet</em>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a> ___________________ <em>javadoc-contrib-xml-query-parser</em>
|
||||
|
|
|
@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="../api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="../api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="../api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -263,9 +260,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="#spellchecker">spellchecker</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#wordnet">wordnet</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#xml-query-parser">xml-query-parser</a>
|
||||
</li>
|
||||
</ul>
|
||||
|
@ -375,12 +369,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<p>Provides tools for spellchecking and suggestions with Lucene.</p>
|
||||
<p>See <a href="../api/contrib-spellchecker/index.html">spellchecker javadoc</a>
|
||||
</p>
|
||||
<a name="N100DE"></a><a name="wordnet"></a>
|
||||
<h3 class="boxed">wordnet</h3>
|
||||
<p>Tools to help utilize wordnet synonyms with Lucene</p>
|
||||
<p>See <a href="../api/contrib-wordnet/index.html">wordnet javadoc</a>
|
||||
</p>
|
||||
<a name="N100ED"></a><a name="xml-query-parser"></a>
|
||||
<a name="N100DE"></a><a name="xml-query-parser"></a>
|
||||
<h3 class="boxed">xml-query-parser</h3>
|
||||
<p>A QueryParser that can read queries written in an XML format.</p>
|
||||
<p>See <a href="../api/contrib-wordnet/index.html">xml-query-parser javadoc</a>
|
||||
|
|
|
@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="api/contrib-spellchecker/index.html">Spellchecker</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -106,11 +106,6 @@
|
|||
<p>See <a href="../api/contrib-spellchecker/index.html">spellchecker javadoc</a></p>
|
||||
</section>
|
||||
|
||||
<section id="wordnet"><title>wordnet</title>
|
||||
<p>Tools to help utilize wordnet synonyms with Lucene</p>
|
||||
<p>See <a href="../api/contrib-wordnet/index.html">wordnet javadoc</a></p>
|
||||
</section>
|
||||
|
||||
<section id="xml-query-parser"><title>xml-query-parser</title>
|
||||
<p>A QueryParser that can read queries written in an XML format.</p>
|
||||
<p>See <a href="../api/contrib-wordnet/index.html">xml-query-parser javadoc</a></p>
|
||||
|
|
|
@ -66,7 +66,6 @@ See http://forrest.apache.org/docs/linking.html for more info
|
|||
<javadoc-contrib-remote label="Remote" href="ext:javadocs-contrib-remote"/>
|
||||
<javadoc-contrib-spatial label="Spatial" href="ext:javadocs-contrib-spatial"/>
|
||||
<javadoc-contrib-spellchecker label="Spellchecker" href="ext:javadocs-contrib-spellchecker"/>
|
||||
<javadoc-contrib-wordnet label="Wordnet" href="ext:javadocs-contrib-wordnet"/>
|
||||
<javadoc-contrib-xml-query-parser label="XML Query Parser" href="ext:javadocs-contrib-xml-query-parser"/>
|
||||
</javadoc-contrib>
|
||||
</javadoc>
|
||||
|
@ -106,7 +105,6 @@ See http://forrest.apache.org/docs/linking.html for more info
|
|||
<javadocs-contrib-remote href="api/contrib-remote/index.html"/>
|
||||
<javadocs-contrib-spatial href="api/contrib-spatial/index.html"/>
|
||||
<javadocs-contrib-spellchecker href="api/contrib-spellchecker/index.html"/>
|
||||
<javadocs-contrib-wordnet href="api/contrib-wordnet/index.html"/>
|
||||
<javadocs-contrib-xml-query-parser href="api/contrib-xml-query-parser/index.html"/>
|
||||
|
||||
<forrest href="http://forrest.apache.org/">
|
||||
|
|
|
@ -261,6 +261,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
text = _TestUtil.randomUnicodeString(random, maxWordLength);
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
|
||||
}
|
||||
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text));
|
||||
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
|
||||
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
|
||||
|
@ -286,6 +290,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
ts.close();
|
||||
// verify reusing is "reproducable" and also get the normal tokenstream sanity checks
|
||||
if (!tokens.isEmpty()) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
|
||||
}
|
||||
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos + type
|
||||
assertAnalyzesToReuse(a, text,
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.index.codecs.CodecProvider;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
|
@ -166,6 +167,13 @@ public class TestIndexWriterCommit extends LuceneTestCase {
|
|||
* measure max temp disk space used.
|
||||
*/
|
||||
public void testCommitOnCloseDiskUsage() throws IOException {
|
||||
// MemoryCodec, since it uses FST, is not necessarily
|
||||
// "additive", ie if you add up N small FSTs, then merge
|
||||
// them, the merged result can easily be larger than the
|
||||
// sum because the merged FST may use array encoding for
|
||||
// some arcs (which uses more space):
|
||||
assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory"));
|
||||
assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory"));
|
||||
MockDirectoryWrapper dir = newDirectory();
|
||||
Analyzer analyzer;
|
||||
if (random.nextBoolean()) {
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.index.codecs.CodecProvider;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
|
@ -142,6 +143,14 @@ public class TestIndexWriterOnDiskFull extends LuceneTestCase {
|
|||
*/
|
||||
public void testAddIndexOnDiskFull() throws IOException
|
||||
{
|
||||
// MemoryCodec, since it uses FST, is not necessarily
|
||||
// "additive", ie if you add up N small FSTs, then merge
|
||||
// them, the merged result can easily be larger than the
|
||||
// sum because the merged FST may use array encoding for
|
||||
// some arcs (which uses more space):
|
||||
assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory"));
|
||||
assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory"));
|
||||
|
||||
int START_COUNT = 57;
|
||||
int NUM_DIR = TEST_NIGHTLY ? 50 : 5;
|
||||
int END_COUNT = START_COUNT + NUM_DIR* (TEST_NIGHTLY ? 25 : 5);
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
public class TestCharsRef extends LuceneTestCase {
|
||||
public void testUTF16InUTF8Order() {
|
||||
final int numStrings = atLeast(1000);
|
||||
BytesRef utf8[] = new BytesRef[numStrings];
|
||||
CharsRef utf16[] = new CharsRef[numStrings];
|
||||
|
||||
for (int i = 0; i < numStrings; i++) {
|
||||
String s = _TestUtil.randomUnicodeString(random);
|
||||
utf8[i] = new BytesRef(s);
|
||||
utf16[i] = new CharsRef(s);
|
||||
}
|
||||
|
||||
Arrays.sort(utf8);
|
||||
Arrays.sort(utf16, CharsRef.getUTF16SortedAsUTF8Comparator());
|
||||
|
||||
for (int i = 0; i < numStrings; i++) {
|
||||
assertEquals(utf8[i].utf8ToString(), utf16[i].toString());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,179 @@
|
|||
package org.apache.lucene.analysis.synonym;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Reader;
|
||||
import java.text.ParseException;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
||||
/**
|
||||
* Parser for the Solr synonyms format.
|
||||
* <ol>
|
||||
* <li> Blank lines and lines starting with '#' are comments.
|
||||
* <li> Explicit mappings match any token sequence on the LHS of "=>"
|
||||
* and replace with all alternatives on the RHS. These types of mappings
|
||||
* ignore the expand parameter in the constructor.
|
||||
* Example:
|
||||
* <blockquote>i-pod, i pod => ipod</blockquote>
|
||||
* <li> Equivalent synonyms may be separated with commas and give
|
||||
* no explicit mapping. In this case the mapping behavior will
|
||||
* be taken from the expand parameter in the constructor. This allows
|
||||
* the same synonym file to be used in different synonym handling strategies.
|
||||
* Example:
|
||||
* <blockquote>ipod, i-pod, i pod</blockquote>
|
||||
*
|
||||
* <li> Multiple synonym mapping entries are merged.
|
||||
* Example:
|
||||
* <blockquote>
|
||||
* foo => foo bar<br>
|
||||
* foo => baz<br><br>
|
||||
* is equivalent to<br><br>
|
||||
* foo => foo bar, baz
|
||||
* </blockquote>
|
||||
* </ol>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SolrSynonymParser extends SynonymMap.Builder {
|
||||
private final boolean expand;
|
||||
private final Analyzer analyzer;
|
||||
|
||||
public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
|
||||
super(dedup);
|
||||
this.expand = expand;
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
|
||||
public void add(Reader in) throws IOException, ParseException {
|
||||
LineNumberReader br = new LineNumberReader(in);
|
||||
try {
|
||||
addInternal(br);
|
||||
} catch (IllegalArgumentException e) {
|
||||
ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
|
||||
ex.initCause(e);
|
||||
throw ex;
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
}
|
||||
|
||||
private void addInternal(BufferedReader in) throws IOException {
|
||||
String line = null;
|
||||
while ((line = in.readLine()) != null) {
|
||||
if (line.length() == 0 || line.charAt(0) == '#') {
|
||||
continue; // ignore empty lines and comments
|
||||
}
|
||||
|
||||
CharsRef inputs[];
|
||||
CharsRef outputs[];
|
||||
|
||||
// TODO: we could process this more efficiently.
|
||||
String sides[] = split(line, "=>");
|
||||
if (sides.length > 1) { // explicit mapping
|
||||
if (sides.length != 2) {
|
||||
throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
|
||||
}
|
||||
String inputStrings[] = split(sides[0], ",");
|
||||
inputs = new CharsRef[inputStrings.length];
|
||||
for (int i = 0; i < inputs.length; i++) {
|
||||
inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
|
||||
}
|
||||
|
||||
String outputStrings[] = split(sides[1], ",");
|
||||
outputs = new CharsRef[outputStrings.length];
|
||||
for (int i = 0; i < outputs.length; i++) {
|
||||
outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
|
||||
}
|
||||
} else {
|
||||
String inputStrings[] = split(line, ",");
|
||||
inputs = new CharsRef[inputStrings.length];
|
||||
for (int i = 0; i < inputs.length; i++) {
|
||||
inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
|
||||
}
|
||||
if (expand) {
|
||||
outputs = inputs;
|
||||
} else {
|
||||
outputs = new CharsRef[1];
|
||||
outputs[0] = inputs[0];
|
||||
}
|
||||
}
|
||||
|
||||
// currently we include the term itself in the map,
|
||||
// and use includeOrig = false always.
|
||||
// this is how the existing filter does it, but it's actually a bug,
|
||||
// especially if combined with ignoreCase = true
|
||||
for (int i = 0; i < inputs.length; i++) {
|
||||
for (int j = 0; j < outputs.length; j++) {
|
||||
add(inputs[i], outputs[j], false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static String[] split(String s, String separator) {
|
||||
ArrayList<String> list = new ArrayList<String>(2);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int pos=0, end=s.length();
|
||||
while (pos < end) {
|
||||
if (s.startsWith(separator,pos)) {
|
||||
if (sb.length() > 0) {
|
||||
list.add(sb.toString());
|
||||
sb=new StringBuilder();
|
||||
}
|
||||
pos+=separator.length();
|
||||
continue;
|
||||
}
|
||||
|
||||
char ch = s.charAt(pos++);
|
||||
if (ch=='\\') {
|
||||
sb.append(ch);
|
||||
if (pos>=end) break; // ERROR, or let it go?
|
||||
ch = s.charAt(pos++);
|
||||
}
|
||||
|
||||
sb.append(ch);
|
||||
}
|
||||
|
||||
if (sb.length() > 0) {
|
||||
list.add(sb.toString());
|
||||
}
|
||||
|
||||
return list.toArray(new String[list.size()]);
|
||||
}
|
||||
|
||||
private String unescape(String s) {
|
||||
if (s.indexOf("\\") >= 0) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char ch = s.charAt(i);
|
||||
if (ch == '\\' && i < s.length() - 1) {
|
||||
sb.append(s.charAt(++i));
|
||||
} else {
|
||||
sb.append(ch);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
return s;
|
||||
}
|
||||
}
|
|
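A hedged usage sketch for the parser above, not part of this commit: it assumes SynonymMap.Builder exposes a build() method returning the SynonymMap consumed by the SynonymFilter shown further below, and it leaves the choice of analyzer and input token stream to the caller.

import java.io.IOException;
import java.io.StringReader;
import java.text.ParseException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

// Assumed to live in org.apache.lucene.analysis.synonym so SolrSynonymParser,
// SynonymMap and SynonymFilter resolve without imports.
class SolrSynonymParserExample {
  static TokenStream synonymStream(Analyzer ruleAnalyzer, TokenStream input)
      throws IOException, ParseException {
    SolrSynonymParser parser = new SolrSynonymParser(true, true, ruleAnalyzer); // dedup, expand
    parser.add(new StringReader("i-pod, i pod => ipod\nfoo => foo bar, baz"));
    SynonymMap map = parser.build();            // assumed Builder method, not shown in this hunk
    return new SynonymFilter(input, map, true); // ignoreCase=true
  }
}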
@ -1,3 +1,5 @@
|
|||
package org.apache.lucene.analysis.synonym;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -15,245 +17,550 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.synonym;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
|
||||
* <p>
|
||||
* The matched tokens from the input stream may be optionally passed through (includeOrig=true)
|
||||
* or discarded. If the original tokens are included, the position increments may be modified
|
||||
* to retain absolute positions after merging with the synonym tokenstream.
|
||||
* <p>
|
||||
* Generated synonyms will start at the same position as the first matched source token.
|
||||
/**
|
||||
* Matches single or multi word synonyms in a token stream.
|
||||
* This token stream cannot properly handle position
|
||||
* increments != 1, ie, you should place this filter before
|
||||
* filtering out stop words.
|
||||
*
|
||||
* <p>Note that with the current implementation, parsing is
|
||||
* greedy, so whenever multiple parses would apply, the rule
|
||||
* starting the earliest and parsing the most tokens wins.
|
||||
* For example if you have these rules:
|
||||
*
|
||||
* <pre>
|
||||
* a -> x
|
||||
* a b -> y
|
||||
* b c d -> z
|
||||
* </pre>
|
||||
*
|
||||
* Then input <code>a b c d e</code> parses to <code>y b c
|
||||
* d</code>, ie the 2nd rule "wins" because it started
|
||||
* earliest and matched the most input tokens of other rules
|
||||
* starting at that point.</p>
|
||||
*
|
||||
* <p>A future improvement to this filter could allow
|
||||
* non-greedy parsing, such that the 3rd rule would win, and
|
||||
* also separately allow multiple parses, such that all 3
|
||||
* rules would match, perhaps even on a rule by rule
|
||||
* basis.</p>
|
||||
*
|
||||
* <p><b>NOTE</b>: when a match occurs, the output tokens
|
||||
* associated with the matching rule are "stacked" on top of
|
||||
* the input stream (if the rule had
|
||||
* <code>keepOrig=true</code>) and also on top of another
|
||||
* matched rule's output tokens. This is not a correct
|
||||
* solution, as really the output should be an arbitrary
|
||||
* graph/lattice. For example, with the above match, you
|
||||
* would expect an exact <code>PhraseQuery</code> <code>"y b
|
||||
* c"</code> to match the parsed tokens, but it will fail to
|
||||
* do so. This limitation is necessary because Lucene's
|
||||
* TokenStream (and index) cannot yet represent an arbitrary
|
||||
* graph.</p>
|
||||
*
|
||||
* <p><b>NOTE</b>: If multiple incoming tokens arrive on the
|
||||
* same position, only the first token at that position is
|
||||
* used for parsing. Subsequent tokens simply pass through
|
||||
* and are not parsed. A future improvement would be to
|
||||
* allow these tokens to also be matched.</p>
|
||||
*/
|
||||
|
||||
// TODO: maybe we should resolve token -> wordID then run
|
||||
// FST on wordIDs, for better perf?
|
||||
|
||||
// TODO: a more efficient approach would be Aho/Corasick's
|
||||
// algorithm
|
||||
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
|
||||
// It improves over the current approach here
|
||||
// because it does not fully re-start matching at every
|
||||
// token. For example, if one pattern is "a b c x"
|
||||
// and another is "b c d" and the input is "a b c d", on
|
||||
// trying to parse "a b c x" but failing when you got to x,
|
||||
// rather than starting over again you really should
|
||||
// immediately recognize that "b c d" matches at the next
|
||||
// input. I suspect this won't matter that much in
|
||||
// practice, but it's possible on some set of synonyms it
|
||||
// will. We'd have to modify Aho/Corasick to enforce our
|
||||
// conflict resolving (eg greedy matching) because that algo
|
||||
// finds all matches.
|
||||
|
||||
public final class SynonymFilter extends TokenFilter {
|
||||
|
||||
private final SynonymMap map; // Map<String, SynonymMap>
|
||||
private Iterator<AttributeSource> replacement; // iterator over generated tokens
|
||||
public static final String TYPE_SYNONYM = "SYNONYM";
|
||||
|
||||
public SynonymFilter(TokenStream in, SynonymMap map) {
|
||||
super(in);
|
||||
if (map == null)
|
||||
throw new IllegalArgumentException("map is required");
|
||||
private final SynonymMap synonyms;
|
||||
|
||||
this.map = map;
|
||||
// just ensuring these attributes exist...
|
||||
addAttribute(CharTermAttribute.class);
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
addAttribute(OffsetAttribute.class);
|
||||
addAttribute(TypeAttribute.class);
|
||||
private final boolean ignoreCase;
|
||||
private final int rollBufferSize;
|
||||
|
||||
private int captureCount;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
// How many future input tokens have already been matched
|
||||
// to a synonym; because the matching is "greedy" we don't
|
||||
// try to do any more matching for such tokens:
|
||||
private int inputSkipCount;
|
||||
|
||||
// Hold all buffered (read ahead) stacked input tokens for
|
||||
// a future position. When multiple tokens are at the
|
||||
// same position, we only store (and match against) the
|
||||
// term for the first token at the position, but capture
|
||||
// state for (and enumerate) all other tokens at this
|
||||
// position:
|
||||
private static class PendingInput {
|
||||
final CharsRef term = new CharsRef();
|
||||
AttributeSource.State state;
|
||||
boolean keepOrig;
|
||||
boolean consumed = true;
|
||||
int startOffset;
|
||||
int endOffset;
|
||||
|
||||
public void reset() {
|
||||
state = null;
|
||||
consumed = true;
|
||||
keepOrig = false;
|
||||
}
|
||||
};
|
||||
|
||||
// Rolling buffer, holding pending input tokens we had to
|
||||
// clone because we needed to look ahead, indexed by
|
||||
// position:
|
||||
private final PendingInput[] futureInputs;
|
||||
|
||||
// Holds pending output synonyms for one future position:
|
||||
private static class PendingOutputs {
|
||||
CharsRef[] outputs;
|
||||
int upto;
|
||||
int count;
|
||||
int posIncr = 1;
|
||||
|
||||
public PendingOutputs() {
|
||||
outputs = new CharsRef[1];
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Need to worry about multiple scenarios:
|
||||
* - need to go for the longest match
|
||||
* a b => foo #shouldn't match if "a b" is followed by "c d"
|
||||
* a b c d => bar
|
||||
* - need to backtrack - retry matches for tokens already read
|
||||
* a b c d => foo
|
||||
* b c => bar
|
||||
* If the input stream is "a b c x", one will consume "a b c d"
|
||||
* trying to match the first rule... all but "a" should be
|
||||
* pushed back so a match may be made on "b c".
|
||||
* - don't try and match generated tokens (thus need separate queue)
|
||||
* matching is not recursive.
|
||||
* - handle optional generation of original tokens in all these cases,
|
||||
* merging token streams to preserve token positions.
|
||||
* - preserve original positionIncrement of first matched token
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (true) {
|
||||
// if there are any generated tokens, return them... don't try any
|
||||
// matches against them, as we specifically don't want recursion.
|
||||
if (replacement!=null && replacement.hasNext()) {
|
||||
copy(this, replacement.next());
|
||||
return true;
|
||||
public void reset() {
|
||||
upto = count = 0;
|
||||
posIncr = 1;
|
||||
}
|
||||
|
||||
// common case fast-path of first token not matching anything
|
||||
AttributeSource firstTok = nextTok();
|
||||
if (firstTok == null) return false;
|
||||
CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
|
||||
SynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
|
||||
if (result == null) {
|
||||
copy(this, firstTok);
|
||||
return true;
|
||||
public CharsRef pullNext() {
|
||||
assert upto < count;
|
||||
final CharsRef result = outputs[upto++];
|
||||
posIncr = 0;
|
||||
if (upto == count) {
|
||||
reset();
|
||||
}
|
||||
|
||||
// fast-path failed, clone ourselves if needed
|
||||
if (firstTok == this)
|
||||
firstTok = cloneAttributes();
|
||||
// OK, we matched a token, so find the longest match.
|
||||
|
||||
matched = new LinkedList<AttributeSource>();
|
||||
|
||||
result = match(result);
|
||||
|
||||
if (result==null) {
|
||||
// no match, simply return the first token read.
|
||||
copy(this, firstTok);
|
||||
return true;
|
||||
}
|
||||
|
||||
// reuse, or create new one each time?
|
||||
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
|
||||
|
||||
//
|
||||
// there was a match... let's generate the new tokens, merging
|
||||
// in the matched tokens (position increments need adjusting)
|
||||
//
|
||||
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
|
||||
boolean includeOrig = result.includeOrig();
|
||||
|
||||
AttributeSource origTok = includeOrig ? firstTok : null;
|
||||
PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
|
||||
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
|
||||
int repPos=0; // curr position in replacement token stream
|
||||
int pos=0; // current position in merged token stream
|
||||
|
||||
for (int i=0; i<result.synonyms.length; i++) {
|
||||
Token repTok = result.synonyms[i];
|
||||
AttributeSource newTok = firstTok.cloneAttributes();
|
||||
CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
|
||||
|
||||
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
|
||||
newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
|
||||
repPos += repTok.getPositionIncrement();
|
||||
if (i==0) repPos=origPos; // make position of first token equal to original
|
||||
|
||||
// if necessary, insert original tokens and adjust position increment
|
||||
while (origTok != null && origPos <= repPos) {
|
||||
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPosInc.setPositionIncrement(origPos-pos);
|
||||
generated.add(origTok);
|
||||
pos += origPosInc.getPositionIncrement();
|
||||
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
||||
if (origTok != null) {
|
||||
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPos += origPosInc.getPositionIncrement();
|
||||
}
|
||||
}
|
||||
|
||||
newPosIncAtt.setPositionIncrement(repPos - pos);
|
||||
generated.add(newTok);
|
||||
pos += newPosIncAtt.getPositionIncrement();
|
||||
}
|
||||
|
||||
// finish up any leftover original tokens
|
||||
while (origTok!=null) {
|
||||
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPosInc.setPositionIncrement(origPos-pos);
|
||||
generated.add(origTok);
|
||||
pos += origPosInc.getPositionIncrement();
|
||||
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
||||
if (origTok != null) {
|
||||
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPos += origPosInc.getPositionIncrement();
|
||||
}
|
||||
}
|
||||
|
||||
// what if we replaced a longer sequence with a shorter one?
|
||||
// a/0 b/5 => foo/0
|
||||
// should I re-create the gap on the next buffered token?
|
||||
|
||||
replacement = generated.iterator();
|
||||
// Now return to the top of the loop to read and return the first
|
||||
// generated token.. The reason this is done is that we may have generated
|
||||
// nothing at all, and may need to continue with more matching logic.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Defer creation of the buffer until the first time it is used to
|
||||
// optimize short fields with no matches.
|
||||
//
|
||||
private LinkedList<AttributeSource> buffer;
|
||||
private LinkedList<AttributeSource> matched;
|
||||
|
||||
private boolean exhausted;
|
||||
|
||||
private AttributeSource nextTok() throws IOException {
|
||||
if (buffer!=null && !buffer.isEmpty()) {
|
||||
return buffer.removeFirst();
|
||||
} else {
|
||||
if (!exhausted && input.incrementToken()) {
|
||||
return this;
|
||||
} else {
|
||||
exhausted = true;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void pushTok(AttributeSource t) {
|
||||
if (buffer==null) buffer=new LinkedList<AttributeSource>();
|
||||
buffer.addFirst(t);
|
||||
}
|
||||
|
||||
private SynonymMap match(SynonymMap map) throws IOException {
|
||||
SynonymMap result = null;
|
||||
|
||||
if (map.submap != null) {
|
||||
AttributeSource tok = nextTok();
|
||||
if (tok != null) {
|
||||
// clone ourselves.
|
||||
if (tok == this)
|
||||
tok = cloneAttributes();
|
||||
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
|
||||
CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
|
||||
SynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
|
||||
|
||||
if (subMap != null) {
|
||||
// recurse
|
||||
result = match(subMap);
|
||||
}
|
||||
|
||||
if (result != null) {
|
||||
matched.addFirst(tok);
|
||||
} else {
|
||||
// push back unmatched token
|
||||
pushTok(tok);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if no longer sequence matched, so if this node has synonyms, it's the match.
|
||||
if (result==null && map.synonyms!=null) {
|
||||
result = map;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private void copy(AttributeSource target, AttributeSource source) {
|
||||
if (target != source)
|
||||
source.copyTo(target);
|
||||
public void add(char[] output, int offset, int len) {
|
||||
if (count == outputs.length) {
|
||||
final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
System.arraycopy(outputs, 0, next, 0, count);
|
||||
outputs = next;
|
||||
}
|
||||
if (outputs[count] == null) {
|
||||
outputs[count] = new CharsRef();
|
||||
}
|
||||
outputs[count].copy(output, offset, len);
|
||||
count++;
|
||||
}
|
||||
};
|
||||
|
||||
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
|
||||
|
||||
// Rolling buffer, holding stack of pending synonym
|
||||
// outputs, indexed by position:
|
||||
private final PendingOutputs[] futureOutputs;
|
||||
|
||||
// Where (in rolling buffers) to write next input saved state:
|
||||
private int nextWrite;
|
||||
|
||||
// Where (in rolling buffers) to read next input saved state:
|
||||
private int nextRead;
|
||||
|
||||
// True once we've read last token
|
||||
private boolean finished;
|
||||
|
||||
private final FST.Arc<BytesRef> scratchArc;
|
||||
|
||||
private final FST<BytesRef> fst;
|
||||
|
||||
private final BytesRef scratchBytes = new BytesRef();
|
||||
private final CharsRef scratchChars = new CharsRef();
|
||||
|
||||
/**
|
||||
* @param input input tokenstream
|
||||
* @param synonyms synonym map
|
||||
* @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
|
||||
* Note, if you set this to true, it's your responsibility to lowercase
|
||||
* the input entries when you create the {@link SynonymMap}
|
||||
*/
|
||||
public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
|
||||
super(input);
|
||||
this.synonyms = synonyms;
|
||||
this.ignoreCase = ignoreCase;
|
||||
this.fst = synonyms.fst;
|
||||
|
||||
if (fst == null) {
|
||||
throw new IllegalArgumentException("fst must be non-null");
|
||||
}
|
||||
|
||||
// Must be 1+ so that when roll buffer is at full
|
||||
// lookahead we can distinguish this full buffer from
|
||||
// the empty buffer:
|
||||
rollBufferSize = 1+synonyms.maxHorizontalContext;
|
||||
|
||||
futureInputs = new PendingInput[rollBufferSize];
|
||||
futureOutputs = new PendingOutputs[rollBufferSize];
|
||||
for(int pos=0;pos<rollBufferSize;pos++) {
|
||||
futureInputs[pos] = new PendingInput();
|
||||
futureOutputs[pos] = new PendingOutputs();
|
||||
}
|
||||
|
||||
//System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);
|
||||
|
||||
scratchArc = new FST.Arc<BytesRef>();
|
||||
}
|
||||
|
||||
private void capture() {
|
||||
captureCount++;
|
||||
//System.out.println(" capture slot=" + nextWrite);
|
||||
final PendingInput input = futureInputs[nextWrite];
|
||||
|
||||
input.state = captureState();
|
||||
input.consumed = false;
|
||||
input.term.copy(termAtt.buffer(), 0, termAtt.length());
|
||||
|
||||
nextWrite = rollIncr(nextWrite);
|
||||
|
||||
// Buffer head should never catch up to tail:
|
||||
assert nextWrite != nextRead;
|
||||
}
|
||||
|
||||
/*
|
||||
This is the core of this TokenFilter: it locates the
|
||||
synonym matches and buffers up the results into
|
||||
futureInputs/Outputs.
|
||||
|
||||
NOTE: this calls input.incrementToken and does not
|
||||
capture the state if no further tokens were checked. So
|
||||
caller must then forward state to our caller, or capture:
|
||||
*/
|
||||
|
||||
private void parse() throws IOException {
|
||||
//System.out.println("\nS: parse");
|
||||
|
||||
assert inputSkipCount == 0;
|
||||
|
||||
int curNextRead = nextRead;
|
||||
|
||||
// Holds the longest match we've seen so far:
|
||||
BytesRef matchOutput = null;
|
||||
int matchInputLength = 0;
|
||||
|
||||
BytesRef pendingOutput = fst.outputs.getNoOutput();
|
||||
fst.getFirstArc(scratchArc);
|
||||
|
||||
assert scratchArc.output == fst.outputs.getNoOutput();
|
||||
|
||||
int tokenCount = 0;
|
||||
|
||||
byToken:
|
||||
while(true) {
|
||||
|
||||
// Pull next token's chars:
|
||||
final char[] buffer;
|
||||
final int bufferLen;
|
||||
//System.out.println(" cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
|
||||
|
||||
if (curNextRead == nextWrite) {
|
||||
|
||||
// We used up our lookahead buffer of input tokens
|
||||
// -- pull next real input token:
|
||||
|
||||
if (finished) {
|
||||
break;
|
||||
} else {
|
||||
//System.out.println(" input.incrToken");
|
||||
assert futureInputs[nextWrite].consumed;
|
||||
// Not correct: a syn match whose output is longer
|
||||
// than its input can set future inputs keepOrig
|
||||
// to true:
|
||||
//assert !futureInputs[nextWrite].keepOrig;
|
||||
if (input.incrementToken()) {
|
||||
buffer = termAtt.buffer();
|
||||
bufferLen = termAtt.length();
|
||||
final PendingInput input = futureInputs[nextWrite];
|
||||
input.startOffset = offsetAtt.startOffset();
|
||||
input.endOffset = offsetAtt.endOffset();
|
||||
//System.out.println(" new token=" + new String(buffer, 0, bufferLen));
|
||||
if (nextRead != nextWrite) {
|
||||
capture();
|
||||
} else {
|
||||
input.consumed = false;
|
||||
}
|
||||
|
||||
} else {
|
||||
// No more input tokens
|
||||
//System.out.println(" set end");
|
||||
finished = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Still in our lookahead
|
||||
buffer = futureInputs[curNextRead].term.chars;
|
||||
bufferLen = futureInputs[curNextRead].term.length;
|
||||
//System.out.println(" old token=" + new String(buffer, 0, bufferLen));
|
||||
}
|
||||
|
||||
tokenCount++;
|
||||
|
||||
// Run each char in this token through the FST:
|
||||
int bufUpto = 0;
|
||||
while(bufUpto < bufferLen) {
|
||||
final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
|
||||
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
|
||||
//System.out.println(" stop");
|
||||
break byToken;
|
||||
}
|
||||
|
||||
// Accum the output
|
||||
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
|
||||
//System.out.println(" char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
|
||||
bufUpto += Character.charCount(codePoint);
|
||||
}
|
||||
|
||||
// OK, entire token matched; now see if this is a final
|
||||
// state:
|
||||
if (scratchArc.isFinal()) {
|
||||
matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
|
||||
matchInputLength = tokenCount;
|
||||
//System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput);
|
||||
}
|
||||
|
||||
// See if the FST wants to continue matching (ie, needs to
|
||||
// see the next input token):
|
||||
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
|
||||
// No further rules can match here; we're done
|
||||
// searching for matching rules starting at the
|
||||
// current input position.
|
||||
break;
|
||||
} else {
|
||||
// More matching is possible -- accum the output (if
|
||||
// any) of the WORD_SEP arc:
|
||||
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
|
||||
if (nextRead == nextWrite) {
|
||||
capture();
|
||||
}
|
||||
}
|
||||
|
||||
curNextRead = rollIncr(curNextRead);
|
||||
}
|
||||
|
||||
if (nextRead == nextWrite && !finished) {
|
||||
//System.out.println(" skip write slot=" + nextWrite);
|
||||
nextWrite = rollIncr(nextWrite);
|
||||
}
|
||||
|
||||
if (matchOutput != null) {
|
||||
//System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
|
||||
inputSkipCount = matchInputLength;
|
||||
addOutput(matchOutput);
|
||||
} else if (nextRead != nextWrite) {
|
||||
// Even though we had no match here, we set to 1
|
||||
// because we need to skip current input token before
|
||||
// trying to match again:
|
||||
inputSkipCount = 1;
|
||||
} else {
|
||||
assert finished;
|
||||
}
|
||||
|
||||
//System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
|
||||
}
|
||||
|
||||
// Interleaves all output tokens onto the futureOutputs:
|
||||
private void addOutput(BytesRef bytes) {
|
||||
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
|
||||
|
||||
final int code = bytesReader.readVInt();
|
||||
final boolean keepOrig = (code & 0x1) == 0;
|
||||
final int count = code >>> 1;
|
||||
//System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig);
|
||||
for(int outputIDX=0;outputIDX<count;outputIDX++) {
|
||||
synonyms.words.get(bytesReader.readVInt(),
|
||||
scratchBytes);
|
||||
//System.out.println(" outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
|
||||
UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
|
||||
int lastStart = scratchChars.offset;
|
||||
final int chEnd = lastStart + scratchChars.length;
|
||||
int outputUpto = nextRead;
|
||||
for(int chIDX=lastStart;chIDX<=chEnd;chIDX++) {
|
||||
if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) {
|
||||
final int outputLen = chIDX - lastStart;
|
||||
// Caller is not allowed to have empty string in
|
||||
// the output:
|
||||
assert outputLen > 0: "output contains empty string: " + scratchChars;
|
||||
futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
|
||||
//System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
|
||||
lastStart = 1+chIDX;
|
||||
futureInputs[outputUpto].keepOrig |= keepOrig;
|
||||
//System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
|
||||
outputUpto = rollIncr(outputUpto);
|
||||
assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ++ mod rollBufferSize
|
||||
private int rollIncr(int count) {
|
||||
count++;
|
||||
if (count == rollBufferSize) {
|
||||
return 0;
|
||||
} else {
|
||||
return count;
|
||||
}
|
||||
}
|
||||
|
||||
// for testing
|
||||
int getCaptureCount() {
|
||||
return captureCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
|
||||
//System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
|
||||
|
||||
while(true) {
|
||||
|
||||
// First play back any buffered future inputs/outputs
|
||||
// w/o running parsing again:
|
||||
while (inputSkipCount != 0) {
|
||||
|
||||
// At each position, we first output the original
|
||||
// token
|
||||
|
||||
// TODO: maybe just a PendingState class, holding
|
||||
// both input & outputs?
|
||||
final PendingInput input = futureInputs[nextRead];
|
||||
final PendingOutputs outputs = futureOutputs[nextRead];
|
||||
|
||||
//System.out.println(" cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
|
||||
|
||||
if (!input.consumed && (input.keepOrig || outputs.count == 0)) {
|
||||
if (input.state != null) {
|
||||
// Return a previously saved token (because we
|
||||
// had to lookahead):
|
||||
restoreState(input.state);
|
||||
} else {
|
||||
// Pass-through case: return token we just pulled
|
||||
// but didn't capture:
|
||||
assert inputSkipCount == 1: "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
|
||||
}
|
||||
input.reset();
|
||||
if (outputs.count > 0) {
|
||||
outputs.posIncr = 0;
|
||||
} else {
|
||||
nextRead = rollIncr(nextRead);
|
||||
inputSkipCount--;
|
||||
}
|
||||
//System.out.println(" return token=" + termAtt.toString());
|
||||
return true;
|
||||
} else if (outputs.upto < outputs.count) {
|
||||
// Still have pending outputs to replay at this
|
||||
// position
|
||||
input.reset();
|
||||
final int posIncr = outputs.posIncr;
|
||||
final CharsRef output = outputs.pullNext();
|
||||
clearAttributes();
|
||||
termAtt.copyBuffer(output.chars, output.offset, output.length);
|
||||
typeAtt.setType(TYPE_SYNONYM);
|
||||
offsetAtt.setOffset(input.startOffset, input.endOffset);
|
||||
posIncrAtt.setPositionIncrement(posIncr);
|
||||
if (outputs.count == 0) {
|
||||
// Done with the buffered input and all outputs at
|
||||
// this position
|
||||
nextRead = rollIncr(nextRead);
|
||||
inputSkipCount--;
|
||||
}
|
||||
//System.out.println(" return token=" + termAtt.toString());
|
||||
return true;
|
||||
} else {
|
||||
// Done with the buffered input and all outputs at
|
||||
// this position
|
||||
input.reset();
|
||||
nextRead = rollIncr(nextRead);
|
||||
inputSkipCount--;
|
||||
}
|
||||
}
|
||||
|
||||
if (finished && nextRead == nextWrite) {
|
||||
// End case: if any output syns went beyond end of
|
||||
// input stream, enumerate them now:
|
||||
final PendingOutputs outputs = futureOutputs[nextRead];
|
||||
if (outputs.upto < outputs.count) {
|
||||
final int posIncr = outputs.posIncr;
|
||||
final CharsRef output = outputs.pullNext();
|
||||
futureInputs[nextRead].reset();
|
||||
if (outputs.count == 0) {
|
||||
nextWrite = nextRead = rollIncr(nextRead);
|
||||
}
|
||||
clearAttributes();
|
||||
termAtt.copyBuffer(output.chars, output.offset, output.length);
|
||||
typeAtt.setType(TYPE_SYNONYM);
|
||||
//System.out.println(" set posIncr=" + outputs.posIncr + " outputs=" + outputs);
|
||||
posIncrAtt.setPositionIncrement(posIncr);
|
||||
//System.out.println(" return token=" + termAtt.toString());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Find new synonym matches:
|
||||
parse();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
captureCount = 0;
|
||||
finished = false;
|
||||
|
||||
// In normal usage these resets would not be needed,
|
||||
// since they reset-as-they-are-consumed, but the app
|
||||
// may not consume all input tokens in which case we
|
||||
// have leftover state here:
|
||||
for (PendingInput input : futureInputs) {
|
||||
input.reset();
|
||||
replacement = null;
|
||||
exhausted = false;
|
||||
}
|
||||
for (PendingOutputs output : futureOutputs) {
|
||||
output.reset();
|
||||
}
|
||||
}
|
||||
}
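A minimal usage sketch of the filter above (not part of this patch): build a small SynonymMap, then wrap an existing TokenStream with ignoreCase=true. The class and method names (SynonymFilterUsageSketch, addSynonyms) are illustrative only; per the constructor javadoc, with ignoreCase=true the rule entries themselves must already be lowercase.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;

public class SynonymFilterUsageSketch {
  public static TokenStream addSynonyms(TokenStream in) throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);      // dedup identical rules
    // single-word rule: "fast" also produces "quick"; keepOrig=true keeps "fast" itself
    builder.add(new CharsRef("fast"), new CharsRef("quick"), true);
    SynonymMap map = builder.build();                               // compiles the rules into an FST
    // ignoreCase=true case-folds the incoming tokens; the rule above is already lowercase
    return new SynonymFilter(in, map, true);
  }
}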
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
package org.apache.lucene.analysis.synonym;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -15,146 +17,301 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.synonym;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefHash;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/** Mapping rules for use with {@link SynonymFilter}
|
||||
/**
|
||||
* A map of synonyms, keys and values are phrases.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SynonymMap {
|
||||
/** @lucene.internal */
|
||||
public CharArrayMap<SynonymMap> submap; // recursive: Map<String, SynonymMap>
|
||||
/** @lucene.internal */
|
||||
public Token[] synonyms;
|
||||
int flags;
|
||||
/** for multiword support, you must separate words with this separator */
|
||||
public static final char WORD_SEPARATOR = 0;
|
||||
/** map<input word, list<ord>> */
|
||||
public final FST<BytesRef> fst;
|
||||
/** map<ord, outputword> */
|
||||
public final BytesRefHash words;
|
||||
/** maxHorizontalContext: maximum context we need on the tokenstream */
|
||||
public final int maxHorizontalContext;
|
||||
|
||||
static final int INCLUDE_ORIG=0x01;
|
||||
static final int IGNORE_CASE=0x02;
|
||||
|
||||
public SynonymMap() {}
|
||||
public SynonymMap(boolean ignoreCase) {
|
||||
if (ignoreCase) flags |= IGNORE_CASE;
|
||||
public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext) {
|
||||
this.fst = fst;
|
||||
this.words = words;
|
||||
this.maxHorizontalContext = maxHorizontalContext;
|
||||
}
|
||||
|
||||
public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; }
|
||||
public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; }
|
||||
|
||||
/**
|
||||
* @param singleMatch List<String>, the sequence of strings to match
|
||||
* @param replacement List<Token> the list of tokens to use on a match
|
||||
* @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
|
||||
* @param mergeExisting merge the replacement tokens with any other mappings that exist
|
||||
* Builds a {@link SynonymMap}.
|
||||
* <p>
|
||||
* Call add() until you have added all the mappings, then call build() to get a {@link SynonymMap}
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
|
||||
SynonymMap currMap = this;
|
||||
for (String str : singleMatch) {
|
||||
if (currMap.submap==null) {
|
||||
// for now hardcode at 4.0, as it's what the old code did.
|
||||
// would be nice to fix, but shouldn't store a version in each submap!!!
|
||||
currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_40, 1, ignoreCase());
|
||||
public static class Builder {
|
||||
private final HashMap<CharsRef,MapEntry> workingSet = new HashMap<CharsRef,MapEntry>();
|
||||
private final BytesRefHash words = new BytesRefHash();
|
||||
private final BytesRef utf8Scratch = new BytesRef(8);
|
||||
private int maxHorizontalContext;
|
||||
private final boolean dedup;
|
||||
|
||||
/** If dedup is true then identical rules (same input,
|
||||
* same output) will be added only once. */
|
||||
public Builder(boolean dedup) {
|
||||
this.dedup = dedup;
|
||||
}
|
||||
|
||||
SynonymMap map = currMap.submap.get(str);
|
||||
if (map==null) {
|
||||
map = new SynonymMap();
|
||||
map.flags |= flags & IGNORE_CASE;
|
||||
currMap.submap.put(str, map);
|
||||
private static class MapEntry {
|
||||
boolean includeOrig;
|
||||
// we could sort for better sharing ultimately, but it could confuse people
|
||||
ArrayList<Integer> ords = new ArrayList<Integer>();
|
||||
}
|
||||
|
||||
currMap = map;
|
||||
/** Sugar: just joins the provided terms with {@link
|
||||
* SynonymMap#WORD_SEPARATOR}. reuse and its chars
|
||||
* must not be null. */
|
||||
public static CharsRef join(String[] words, CharsRef reuse) {
|
||||
int upto = 0;
|
||||
char[] buffer = reuse.chars;
|
||||
for(String word : words) {
|
||||
if (upto > 0) {
|
||||
if (upto >= buffer.length) {
|
||||
reuse.grow(upto);
|
||||
buffer = reuse.chars;
|
||||
}
|
||||
buffer[upto++] = SynonymMap.WORD_SEPARATOR;
|
||||
}
|
||||
|
||||
if (currMap.synonyms != null && !mergeExisting) {
|
||||
throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
|
||||
}
|
||||
List<Token> superset = currMap.synonyms==null ? replacement :
|
||||
mergeTokens(Arrays.asList(currMap.synonyms), replacement);
|
||||
currMap.synonyms = superset.toArray(new Token[superset.size()]);
|
||||
if (includeOrig) currMap.flags |= INCLUDE_ORIG;
|
||||
final int wordLen = word.length();
|
||||
final int needed = upto + wordLen;
|
||||
if (needed > buffer.length) {
|
||||
reuse.grow(needed);
|
||||
buffer = reuse.chars;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder("<");
|
||||
if (synonyms!=null) {
|
||||
sb.append("[");
|
||||
for (int i=0; i<synonyms.length; i++) {
|
||||
if (i!=0) sb.append(',');
|
||||
sb.append(synonyms[i]);
|
||||
}
|
||||
if ((flags & INCLUDE_ORIG)!=0) {
|
||||
sb.append(",ORIG");
|
||||
}
|
||||
sb.append("],");
|
||||
}
|
||||
sb.append(submap);
|
||||
sb.append(">");
|
||||
return sb.toString();
|
||||
word.getChars(0, wordLen, buffer, upto);
|
||||
upto += wordLen;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** Produces a List<Token> from a List<String> */
|
||||
public static List<Token> makeTokens(List<String> strings) {
|
||||
List<Token> ret = new ArrayList<Token>(strings.size());
|
||||
for (String str : strings) {
|
||||
//Token newTok = new Token(str,0,0,"SYNONYM");
|
||||
Token newTok = new Token(str, 0,0,"SYNONYM");
|
||||
ret.add(newTok);
|
||||
}
|
||||
return ret;
|
||||
return reuse;
|
||||
}
|
||||
|
||||
/** Sugar: analyzes the text with the analyzer and
|
||||
* separates by {@link SynonymMap#WORD_SEPARATOR}.
|
||||
* reuse and its chars must not be null. */
|
||||
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
|
||||
TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text));
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
ts.reset();
|
||||
reuse.length = 0;
|
||||
while (ts.incrementToken()) {
|
||||
int length = termAtt.length();
|
||||
if (length == 0) {
|
||||
throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
|
||||
}
|
||||
if (posIncAtt.getPositionIncrement() != 1) {
|
||||
throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
|
||||
}
|
||||
reuse.grow(reuse.length + length + 1); /* current + word + separator */
|
||||
int end = reuse.offset + reuse.length;
|
||||
if (reuse.length > 0) {
|
||||
reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
|
||||
reuse.length++;
|
||||
}
|
||||
System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
|
||||
reuse.length += length;
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
if (reuse.length == 0) {
|
||||
throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
|
||||
}
|
||||
return reuse;
|
||||
}
|
||||
|
||||
/** only used for asserting! */
|
||||
private boolean hasHoles(CharsRef chars) {
|
||||
final int end = chars.offset + chars.length;
|
||||
for(int idx=chars.offset+1;idx<end;idx++) {
|
||||
if (chars.chars[idx] == SynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == SynonymMap.WORD_SEPARATOR) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (chars.chars[chars.offset] == '\u0000') {
|
||||
return true;
|
||||
}
|
||||
if (chars.chars[chars.offset + chars.length - 1] == '\u0000') {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// NOTE: while it's tempting to make this public, since
|
||||
// caller's parser likely knows the
|
||||
// numInput/numOutputWords, sneaky exceptions, much later
|
||||
// on, will result if these values are wrong; so we always
|
||||
// recompute ourselves to be safe:
|
||||
private void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) {
|
||||
// first convert to UTF-8
|
||||
if (numInputWords <= 0) {
|
||||
throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
|
||||
}
|
||||
if (input.length <= 0) {
|
||||
throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
|
||||
}
|
||||
if (numOutputWords <= 0) {
|
||||
throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
|
||||
}
|
||||
if (output.length <= 0) {
|
||||
throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
|
||||
}
|
||||
|
||||
assert !hasHoles(input): "input has holes: " + input;
|
||||
assert !hasHoles(output): "output has holes: " + output;
|
||||
|
||||
//System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
|
||||
final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch);
|
||||
// lookup in hash
|
||||
int ord = words.add(utf8Scratch, hashCode);
|
||||
if (ord < 0) {
|
||||
// already exists in our hash
|
||||
ord = (-ord)-1;
|
||||
//System.out.println(" output=" + output + " old ord=" + ord);
|
||||
} else {
|
||||
//System.out.println(" output=" + output + " new ord=" + ord);
|
||||
}
|
||||
|
||||
MapEntry e = workingSet.get(input);
|
||||
if (e == null) {
|
||||
e = new MapEntry();
|
||||
workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map
|
||||
}
|
||||
|
||||
e.ords.add(ord);
|
||||
e.includeOrig |= includeOrig;
|
||||
maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
|
||||
maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
|
||||
}
|
||||
|
||||
private int countWords(CharsRef chars) {
|
||||
int wordCount = 1;
|
||||
int upto = chars.offset;
|
||||
final int limit = chars.offset + chars.length;
|
||||
while(upto < limit) {
|
||||
if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) {
|
||||
wordCount++;
|
||||
}
|
||||
}
|
||||
return wordCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
|
||||
* the tokens end up at the same position.
|
||||
*
|
||||
* Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
|
||||
* Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
|
||||
* Add a phrase->phrase synonym mapping.
|
||||
* Phrases are character sequences where words are
|
||||
* separated with character zero (\u0000). Empty words
|
||||
* (two \u0000s in a row) are not allowed in the input or
|
||||
* the output!
|
||||
*
|
||||
* @param input input phrase
|
||||
* @param output output phrase
|
||||
* @param includeOrig true if the original should be included
|
||||
*/
|
||||
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
|
||||
ArrayList<Token> result = new ArrayList<Token>();
|
||||
if (lst1 ==null || lst2 ==null) {
|
||||
if (lst2 != null) result.addAll(lst2);
|
||||
if (lst1 != null) result.addAll(lst1);
|
||||
return result;
|
||||
public void add(CharsRef input, CharsRef output, boolean includeOrig) {
|
||||
add(input, countWords(input), output, countWords(output), includeOrig);
|
||||
}
|
||||
|
||||
int pos=0;
|
||||
Iterator<Token> iter1=lst1.iterator();
|
||||
Iterator<Token> iter2=lst2.iterator();
|
||||
Token tok1 = iter1.hasNext() ? iter1.next() : null;
|
||||
Token tok2 = iter2.hasNext() ? iter2.next() : null;
|
||||
int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0;
|
||||
int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0;
|
||||
while(tok1!=null || tok2!=null) {
|
||||
while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
|
||||
Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
|
||||
tok.copyBuffer(tok1.buffer(), 0, tok1.length());
|
||||
tok.setPositionIncrement(pos1-pos);
|
||||
result.add(tok);
|
||||
pos=pos1;
|
||||
tok1 = iter1.hasNext() ? iter1.next() : null;
|
||||
pos1 += tok1!=null ? tok1.getPositionIncrement() : 0;
|
||||
}
|
||||
while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
|
||||
Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
|
||||
tok.copyBuffer(tok2.buffer(), 0, tok2.length());
|
||||
tok.setPositionIncrement(pos2-pos);
|
||||
result.add(tok);
|
||||
pos=pos2;
|
||||
tok2 = iter2.hasNext() ? iter2.next() : null;
|
||||
pos2 += tok2!=null ? tok2.getPositionIncrement() : 0;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
/**
|
||||
* Builds a {@link SynonymMap} and returns it.
|
||||
*/
|
||||
public SynonymMap build() throws IOException {
|
||||
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
// TODO: are we using the best sharing options?
|
||||
org.apache.lucene.util.fst.Builder<BytesRef> builder =
|
||||
new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
|
||||
|
||||
BytesRef scratch = new BytesRef(64);
|
||||
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
|
||||
|
||||
final Set<Integer> dedupSet;
|
||||
|
||||
if (dedup) {
|
||||
dedupSet = new HashSet<Integer>();
|
||||
} else {
|
||||
dedupSet = null;
|
||||
}
|
||||
|
||||
final byte[] spare = new byte[5];
|
||||
|
||||
Set<CharsRef> keys = workingSet.keySet();
|
||||
CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
|
||||
Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
|
||||
|
||||
//System.out.println("fmap.build");
|
||||
for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
|
||||
CharsRef input = sortedKeys[keyIdx];
|
||||
MapEntry output = workingSet.get(input);
|
||||
|
||||
int numEntries = output.ords.size();
|
||||
// output size, assume the worst case
|
||||
int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
|
||||
|
||||
scratch.grow(estimatedSize);
|
||||
scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
|
||||
assert scratch.offset == 0;
|
||||
|
||||
// now write our output data:
|
||||
int count = 0;
|
||||
for (int i = 0; i < numEntries; i++) {
|
||||
if (dedupSet != null) {
|
||||
// box once
|
||||
final Integer ent = output.ords.get(i);
|
||||
if (dedupSet.contains(ent)) {
|
||||
continue;
|
||||
}
|
||||
dedupSet.add(ent);
|
||||
}
|
||||
scratchOutput.writeVInt(output.ords.get(i));
|
||||
count++;
|
||||
}
|
||||
|
||||
final int pos = scratchOutput.getPosition();
|
||||
scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
|
||||
final int pos2 = scratchOutput.getPosition();
|
||||
final int vIntLen = pos2-pos;
|
||||
|
||||
// Move the count + includeOrig to the front of the byte[]:
|
||||
System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
|
||||
System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
|
||||
System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);
|
||||
|
||||
if (dedupSet != null) {
|
||||
dedupSet.clear();
|
||||
}
|
||||
|
||||
scratch.length = scratchOutput.getPosition() - scratch.offset;
|
||||
//System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
|
||||
builder.add(input, new BytesRef(scratch));
|
||||
}
|
||||
|
||||
FST<BytesRef> fst = builder.finish();
|
||||
return new SynonymMap(fst, words, maxHorizontalContext);
|
||||
}
|
||||
}
|
||||
}
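A hedged sketch of feeding rules to the Builder above (not part of this patch): multi-word phrases are plain char sequences with SynonymMap.WORD_SEPARATOR (\u0000) between words, produced either with the replaceAll trick the tests use or with the join() sugar. The class name SynonymMapBuilderSketch is illustrative only.

import java.io.IOException;

import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;

public class SynonymMapBuilderSketch {
  public static SynonymMap build() throws IOException {
    SynonymMap.Builder b = new SynonymMap.Builder(true);            // dedup=true
    // phrase -> word: words inside a phrase are separated by WORD_SEPARATOR (\u0000)
    b.add(new CharsRef("wi fi".replaceAll(" +", "\u0000")),
          new CharsRef("wireless"),
          true);                                                    // keep the original tokens too
    // join() builds the same separated form from a String[]
    b.add(SynonymMap.Builder.join(new String[] {"domain", "name", "service"}, new CharsRef(32)),
          new CharsRef("dns"),
          false);
    return b.build();                                               // compiles all rules into an FST
  }
}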
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
package org.apache.lucene.analysis.synonym;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Reader;
|
||||
import java.text.ParseException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
||||
/**
|
||||
* Parser for wordnet prolog format
|
||||
* <p>
|
||||
* See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
|
||||
public class WordnetSynonymParser extends SynonymMap.Builder {
|
||||
private final boolean expand;
|
||||
private final Analyzer analyzer;
|
||||
|
||||
public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
|
||||
super(dedup);
|
||||
this.expand = expand;
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
|
||||
public void add(Reader in) throws IOException, ParseException {
|
||||
LineNumberReader br = new LineNumberReader(in);
|
||||
try {
|
||||
String line = null;
|
||||
String lastSynSetID = "";
|
||||
CharsRef synset[] = new CharsRef[8];
|
||||
int synsetSize = 0;
|
||||
|
||||
while ((line = br.readLine()) != null) {
|
||||
String synSetID = line.substring(2, 11);
|
||||
|
||||
if (!synSetID.equals(lastSynSetID)) {
|
||||
addInternal(synset, synsetSize);
|
||||
synsetSize = 0;
|
||||
}
|
||||
|
||||
if (synset.length <= synsetSize+1) {
|
||||
CharsRef larger[] = new CharsRef[synset.length * 2];
|
||||
System.arraycopy(synset, 0, larger, 0, synsetSize);
|
||||
synset = larger;
|
||||
}
|
||||
|
||||
synset[synsetSize] = parseSynonym(line, synset[synsetSize]);
|
||||
synsetSize++;
|
||||
lastSynSetID = synSetID;
|
||||
}
|
||||
|
||||
// final synset in the file
|
||||
addInternal(synset, synsetSize);
|
||||
} catch (IllegalArgumentException e) {
|
||||
ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
|
||||
ex.initCause(e);
|
||||
throw ex;
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
}
|
||||
|
||||
private CharsRef parseSynonym(String line, CharsRef reuse) throws IOException {
|
||||
if (reuse == null) {
|
||||
reuse = new CharsRef(8);
|
||||
}
|
||||
|
||||
int start = line.indexOf('\'')+1;
|
||||
int end = line.lastIndexOf('\'');
|
||||
|
||||
String text = line.substring(start, end).replace("''", "'");
|
||||
return analyze(analyzer, text, reuse);
|
||||
}
|
||||
|
||||
private void addInternal(CharsRef synset[], int size) throws IOException {
|
||||
if (size <= 1) {
|
||||
return; // nothing to do
|
||||
}
|
||||
|
||||
if (expand) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (int j = 0; j < size; j++) {
|
||||
add(synset[i], synset[j], false);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < size; i++) {
|
||||
add(synset[i], synset[0], false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
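A hedged sketch of loading the WordNet prolog synonyms with the parser above (not part of this patch): the caller supplies the analyzer and the path to wn_s.pl; the class and method names here are illustrative only. Per addInternal() above, expand=true maps every synset member to every other member, while expand=false maps all members to the first one.

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;

public class WordnetLoadSketch {
  public static SynonymMap load(String wnSplPath, Analyzer analyzer) throws IOException, ParseException {
    WordnetSynonymParser parser = new WordnetSynonymParser(true, true, analyzer); // dedup, expand
    Reader in = new FileReader(wnSplPath);               // e.g. prologwn/wn_s.pl
    parser.add(in);                                      // parses the file and closes the reader
    return parser.build();                               // inherited from SynonymMap.Builder
  }
}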
|
|
@ -1,3 +1,4 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -14,13 +15,8 @@
|
|||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<title>
|
||||
wordnet
|
||||
</title>
|
||||
</head>
|
||||
<body>
|
||||
wordnet
|
||||
</body>
|
||||
<html><head></head>
|
||||
<body>
|
||||
Analysis components for Synonyms.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,144 @@
|
|||
package org.apache.lucene.analysis.synonym;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.text.ParseException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.en.EnglishAnalyzer;
|
||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Tests parser for the Solr synonyms format
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
|
||||
|
||||
/** Tests some simple examples from the solr wiki */
|
||||
public void testSimple() throws Exception {
|
||||
String testFile =
|
||||
"i-pod, ipod, ipoooood\n" +
|
||||
"foo => foo bar\n" +
|
||||
"foo => baz\n" +
|
||||
"this test, that testing";
|
||||
|
||||
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
|
||||
parser.add(new StringReader(testFile));
|
||||
final SynonymMap map = parser.build();
|
||||
|
||||
Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
|
||||
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
|
||||
}
|
||||
};
|
||||
|
||||
assertAnalyzesTo(analyzer, "ball",
|
||||
new String[] { "ball" },
|
||||
new int[] { 1 });
|
||||
|
||||
assertAnalyzesTo(analyzer, "i-pod",
|
||||
new String[] { "i-pod", "ipod", "ipoooood" },
|
||||
new int[] { 1, 0, 0 });
|
||||
|
||||
assertAnalyzesTo(analyzer, "foo",
|
||||
new String[] { "foo", "baz", "bar" },
|
||||
new int[] { 1, 0, 1 });
|
||||
|
||||
assertAnalyzesTo(analyzer, "this test",
|
||||
new String[] { "this", "that", "test", "testing" },
|
||||
new int[] { 1, 0, 1, 0 });
|
||||
}
|
||||
|
||||
/** parse a syn file with bad syntax */
|
||||
@Test(expected=ParseException.class)
|
||||
public void testInvalidDoubleMap() throws Exception {
|
||||
String testFile = "a => b => c";
|
||||
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
|
||||
parser.add(new StringReader(testFile));
|
||||
}
|
||||
|
||||
/** parse a syn file with bad syntax */
|
||||
@Test(expected=ParseException.class)
|
||||
public void testInvalidAnalyzesToNothingOutput() throws Exception {
|
||||
String testFile = "a => 1";
|
||||
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
|
||||
parser.add(new StringReader(testFile));
|
||||
}
|
||||
|
||||
/** parse a syn file with bad syntax */
|
||||
@Test(expected=ParseException.class)
|
||||
public void testInvalidAnalyzesToNothingInput() throws Exception {
|
||||
String testFile = "1 => a";
|
||||
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
|
||||
parser.add(new StringReader(testFile));
|
||||
}
|
||||
|
||||
/** parse a syn file with bad syntax */
|
||||
@Test(expected=ParseException.class)
|
||||
public void testInvalidPositionsInput() throws Exception {
|
||||
String testFile = "testola => the test";
|
||||
SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
|
||||
parser.add(new StringReader(testFile));
|
||||
}
|
||||
|
||||
/** parse a syn file with bad syntax */
|
||||
@Test(expected=ParseException.class)
|
||||
public void testInvalidPositionsOutput() throws Exception {
|
||||
String testFile = "the test => testola";
|
||||
SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
|
||||
parser.add(new StringReader(testFile));
|
||||
}
|
||||
|
||||
/** parse a syn file with some escaped syntax chars */
|
||||
public void testEscapedStuff() throws Exception {
|
||||
String testFile =
|
||||
"a\\=>a => b\\=>b\n" +
|
||||
"a\\,a => b\\,b";
|
||||
SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
|
||||
parser.add(new StringReader(testFile));
|
||||
final SynonymMap map = parser.build();
|
||||
Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
|
||||
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
|
||||
}
|
||||
};
|
||||
|
||||
assertAnalyzesTo(analyzer, "ball",
|
||||
new String[] { "ball" },
|
||||
new int[] { 1 });
|
||||
|
||||
assertAnalyzesTo(analyzer, "a=>a",
|
||||
new String[] { "b=>b" },
|
||||
new int[] { 1 });
|
||||
|
||||
assertAnalyzesTo(analyzer, "a,a",
|
||||
new String[] { "b,b" },
|
||||
new int[] { 1 });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,393 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.synonym;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.*;
|
||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
private SynonymMap.Builder b;
|
||||
private Tokenizer tokensIn;
|
||||
private SynonymFilter tokensOut;
|
||||
private CharTermAttribute termAtt;
|
||||
private PositionIncrementAttribute posIncrAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
|
||||
private void add(String input, String output, boolean keepOrig) {
|
||||
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
|
||||
new CharsRef(output.replaceAll(" +", "\u0000")),
|
||||
keepOrig);
|
||||
}
|
||||
|
||||
private void assertEquals(CharTermAttribute term, String expected) {
|
||||
assertEquals(expected.length(), term.length());
|
||||
final char[] buffer = term.buffer();
|
||||
for(int chIDX=0;chIDX<expected.length();chIDX++) {
|
||||
assertEquals(expected.charAt(chIDX), buffer[chIDX]);
|
||||
}
|
||||
}
|
||||
|
||||
private void verify(String input, String output) throws Exception {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
|
||||
}
|
||||
|
||||
tokensIn.reset(new StringReader(input));
|
||||
tokensOut.reset();
|
||||
final String[] expected = output.split(" ");
|
||||
int expectedUpto = 0;
|
||||
while(tokensOut.incrementToken()) {
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
|
||||
}
|
||||
|
||||
assertTrue(expectedUpto < expected.length);
|
||||
final int startOffset = offsetAtt.startOffset();
|
||||
final int endOffset = offsetAtt.endOffset();
|
||||
|
||||
final String[] expectedAtPos = expected[expectedUpto++].split("/");
|
||||
for(int atPos=0;atPos<expectedAtPos.length;atPos++) {
|
||||
if (atPos > 0) {
|
||||
assertTrue(tokensOut.incrementToken());
|
||||
if (VERBOSE) {
|
||||
System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
|
||||
}
|
||||
}
|
||||
assertEquals(termAtt, expectedAtPos[atPos]);
|
||||
assertEquals(atPos == 0 ? 1 : 0,
|
||||
posIncrAtt.getPositionIncrement());
|
||||
// start/end offset of all tokens at same pos should
|
||||
// be the same:
|
||||
assertEquals(startOffset, offsetAtt.startOffset());
|
||||
assertEquals(endOffset, offsetAtt.endOffset());
|
||||
}
|
||||
}
|
||||
tokensOut.end();
|
||||
tokensOut.close();
|
||||
if (VERBOSE) {
|
||||
System.out.println(" incr: END");
|
||||
}
|
||||
assertEquals(expectedUpto, expected.length);
|
||||
}
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
b = new SynonymMap.Builder(true);
|
||||
add("a", "foo", true);
|
||||
add("a b", "bar fee", true);
|
||||
add("b c", "dog collar", true);
|
||||
add("c d", "dog harness holder extras", true);
|
||||
add("m c e", "dog barks loudly", false);
|
||||
|
||||
add("e f", "foo bar", false);
|
||||
add("e f", "baz bee", false);
|
||||
|
||||
add("z", "boo", false);
|
||||
add("y", "bee", true);
|
||||
|
||||
tokensIn = new MockTokenizer(new StringReader("a"),
|
||||
MockTokenizer.WHITESPACE,
|
||||
true);
|
||||
tokensIn.reset();
|
||||
assertTrue(tokensIn.incrementToken());
|
||||
assertFalse(tokensIn.incrementToken());
|
||||
tokensIn.end();
|
||||
tokensIn.close();
|
||||
|
||||
tokensOut = new SynonymFilter(tokensIn,
|
||||
b.build(),
|
||||
true);
|
||||
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
|
||||
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
|
||||
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
|
||||
|
||||
verify("a b c", "a/bar b/fee c");
|
||||
|
||||
// syn output extends beyond input tokens
|
||||
verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");
|
||||
|
||||
verify("a b a", "a/bar b/fee a/foo");
|
||||
|
||||
// outputs that add to one another:
|
||||
verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");
|
||||
|
||||
// two outputs for same input
|
||||
verify("e f", "foo/baz bar/bee");
|
||||
|
||||
// mixed keepOrig true/false:
|
||||
verify("a m c e x", "a/foo dog barks loudly x");
|
||||
verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
|
||||
assertTrue(tokensOut.getCaptureCount() > 0);
|
||||
|
||||
// no captureStates when no syns matched
|
||||
verify("p q r s t", "p q r s t");
|
||||
assertEquals(0, tokensOut.getCaptureCount());
|
||||
|
||||
// no captureStates when only single-input syns, w/ no
|
||||
// lookahead needed, matched
|
||||
verify("p q z y t", "p q boo y/bee t");
|
||||
assertEquals(0, tokensOut.getCaptureCount());
|
||||
}
|
||||
|
||||
private String getRandomString(char start, int alphabetSize, int length) {
|
||||
assert alphabetSize <= 26;
|
||||
char[] s = new char[2*length];
|
||||
for(int charIDX=0;charIDX<length;charIDX++) {
|
||||
s[2*charIDX] = (char) (start + random.nextInt(alphabetSize));
|
||||
s[2*charIDX+1] = ' ';
|
||||
}
|
||||
return new String(s);
|
||||
}
|
||||
|
||||
private static class OneSyn {
|
||||
String in;
|
||||
List<String> out;
|
||||
boolean keepOrig;
|
||||
}
|
||||
|
||||
public String slowSynMatcher(String doc, List<OneSyn> syns, int maxOutputLength) {
|
||||
assertTrue(doc.length() % 2 == 0);
|
||||
final int numInputs = doc.length()/2;
|
||||
boolean[] keepOrigs = new boolean[numInputs];
|
||||
Arrays.fill(keepOrigs, false);
|
||||
String[] outputs = new String[numInputs + maxOutputLength];
|
||||
OneSyn[] matches = new OneSyn[numInputs];
|
||||
for(OneSyn syn : syns) {
|
||||
int idx = -1;
|
||||
while(true) {
|
||||
idx = doc.indexOf(syn.in, 1+idx);
|
||||
if (idx == -1) {
|
||||
break;
|
||||
}
|
||||
assertTrue(idx % 2 == 0);
|
||||
final int matchIDX = idx/2;
|
||||
assertTrue(syn.in.length() % 2 == 1);
|
||||
if (matches[matchIDX] == null) {
|
||||
matches[matchIDX] = syn;
|
||||
} else if (syn.in.length() > matches[matchIDX].in.length()) {
|
||||
// Greedy conflict resolution: longer match wins:
|
||||
matches[matchIDX] = syn;
|
||||
} else {
|
||||
assertTrue(syn.in.length() < matches[matchIDX].in.length());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Greedy conflict resolution: if syn matches a range of inputs,
|
||||
// it prevents other syns from matching that range
|
||||
for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
|
||||
final OneSyn match = matches[inputIDX];
|
||||
if (match != null) {
|
||||
final int synInLength = (1+match.in.length())/2;
|
||||
for(int nextInputIDX=inputIDX+1;nextInputIDX<numInputs && nextInputIDX<(inputIDX+synInLength);nextInputIDX++) {
|
||||
matches[nextInputIDX] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fill overlapping outputs:
|
||||
for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
|
||||
final OneSyn syn = matches[inputIDX];
|
||||
if (syn == null) {
|
||||
continue;
|
||||
}
|
||||
for(String synOut : syn.out) {
|
||||
final String[] synOutputs = synOut.split(" ");
|
||||
assertEquals(synOutputs.length, (1+synOut.length())/2);
|
||||
final int matchEnd = inputIDX + synOutputs.length;
|
||||
int synUpto = 0;
|
||||
for(int matchIDX=inputIDX;matchIDX<matchEnd;matchIDX++) {
|
||||
if (outputs[matchIDX] == null) {
|
||||
outputs[matchIDX] = synOutputs[synUpto++];
|
||||
} else {
|
||||
outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
|
||||
}
|
||||
if (matchIDX < numInputs) {
|
||||
keepOrigs[matchIDX] |= syn.keepOrig;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String[] inputTokens = doc.split(" ");
|
||||
final int limit = inputTokens.length + maxOutputLength;
|
||||
for(int inputIDX=0;inputIDX<limit;inputIDX++) {
|
||||
boolean posHasOutput = false;
|
||||
if (inputIDX >= numInputs && outputs[inputIDX] == null) {
|
||||
break;
|
||||
}
|
||||
if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
|
||||
sb.append(inputTokens[inputIDX]);
|
||||
posHasOutput = true;
|
||||
}
|
||||
|
||||
if (outputs[inputIDX] != null) {
|
||||
if (posHasOutput) {
|
||||
sb.append('/');
|
||||
}
|
||||
sb.append(outputs[inputIDX]);
|
||||
}
|
||||
if (inputIDX < limit-1) {
|
||||
sb.append(' ');
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
|
||||
final int alphabetSize = _TestUtil.nextInt(random, 2, 7);
|
||||
|
||||
final int docLen = atLeast(3000);
|
||||
//final int docLen = 50;
|
||||
|
||||
final String document = getRandomString('a', alphabetSize, docLen);
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: doc=" + document);
|
||||
}
|
||||
|
||||
final int numSyn = atLeast(5);
|
||||
//final int numSyn = 2;
|
||||
|
||||
final Map<String,OneSyn> synMap = new HashMap<String,OneSyn>();
|
||||
final List<OneSyn> syns = new ArrayList<OneSyn>();
|
||||
final boolean dedup = random.nextBoolean();
|
||||
if (VERBOSE) {
|
||||
System.out.println(" dedup=" + dedup);
|
||||
}
|
||||
b = new SynonymMap.Builder(dedup);
|
||||
for(int synIDX=0;synIDX<numSyn;synIDX++) {
|
||||
final String synIn = getRandomString('a', alphabetSize, _TestUtil.nextInt(random, 1, 5)).trim();
|
||||
OneSyn s = synMap.get(synIn);
|
||||
if (s == null) {
|
||||
s = new OneSyn();
|
||||
s.in = synIn;
|
||||
syns.add(s);
|
||||
s.out = new ArrayList<String>();
|
||||
synMap.put(synIn, s);
|
||||
s.keepOrig = random.nextBoolean();
|
||||
}
|
||||
final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim();
|
||||
s.out.add(synOut);
|
||||
add(synIn, synOut, s.keepOrig);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
|
||||
}
|
||||
}
|
||||
|
||||
tokensIn = new MockTokenizer(new StringReader("a"),
|
||||
MockTokenizer.WHITESPACE,
|
||||
true);
|
||||
tokensIn.reset();
|
||||
assertTrue(tokensIn.incrementToken());
|
||||
assertFalse(tokensIn.incrementToken());
|
||||
tokensIn.end();
|
||||
tokensIn.close();
|
||||
|
||||
tokensOut = new SynonymFilter(tokensIn,
|
||||
b.build(),
|
||||
true);
|
||||
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
|
||||
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
|
||||
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
|
||||
|
||||
if (dedup) {
|
||||
pruneDups(syns);
|
||||
}
|
||||
|
||||
final String expected = slowSynMatcher(document, syns, 5);
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: expected=" + expected);
|
||||
}
|
||||
|
||||
verify(document, expected);
|
||||
}
|
||||
|
||||
private void pruneDups(List<OneSyn> syns) {
|
||||
Set<String> seen = new HashSet<String>();
|
||||
for(OneSyn syn : syns) {
|
||||
int idx = 0;
|
||||
while(idx < syn.out.size()) {
|
||||
String out = syn.out.get(idx);
|
||||
if (!seen.contains(out)) {
|
||||
seen.add(out);
|
||||
idx++;
|
||||
} else {
|
||||
syn.out.remove(idx);
|
||||
}
|
||||
}
|
||||
seen.clear();
|
||||
}
|
||||
}
|
||||
|
||||
private String randomNonEmptyString() {
|
||||
while(true) {
|
||||
final String s = _TestUtil.randomUnicodeString(random).trim();
|
||||
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** simple random test, doesn't verify correctness.
|
||||
* does verify it doesn't throw exceptions and that the stream doesn't misbehave
|
||||
*/
|
||||
public void testRandom2() throws Exception {
|
||||
final int numIters = atLeast(10);
|
||||
for (int i = 0; i < numIters; i++) {
|
||||
b = new SynonymMap.Builder(random.nextBoolean());
|
||||
final int numEntries = atLeast(10);
|
||||
for (int j = 0; j < numEntries; j++) {
|
||||
add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
|
||||
}
|
||||
final SynonymMap map = b.build();
|
||||
final boolean ignoreCase = random.nextBoolean();
|
||||
|
||||
final Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
|
||||
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
|
||||
}
|
||||
};
|
||||
|
||||
checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.synonym;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;

public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
  Analyzer analyzer;

  String synonymsFile =
    "s(100000001,1,'woods',n,1,0).\n" +
    "s(100000001,2,'wood',n,1,0).\n" +
    "s(100000001,3,'forest',n,1,0).\n" +
    "s(100000002,1,'wolfish',n,1,0).\n" +
    "s(100000002,2,'ravenous',n,1,0).\n" +
    "s(100000003,1,'king',n,1,1).\n" +
    "s(100000003,2,'baron',n,1,1).\n" +
    "s(100000004,1,'king''s evil',n,1,1).\n" +
    "s(100000004,2,'king''s meany',n,1,1).\n";

  public void testSynonyms() throws Exception {
    WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random));
    parser.add(new StringReader(synonymsFile));
    final SynonymMap map = parser.build();

    Analyzer analyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
      }
    };

    /* all expansions */
    assertAnalyzesTo(analyzer, "Lost in the woods",
        new String[] { "Lost", "in", "the", "woods", "wood", "forest" },
        new int[] { 0, 5, 8, 12, 12, 12 },
        new int[] { 4, 7, 11, 17, 17, 17 },
        new int[] { 1, 1, 1, 1, 0, 0 });

    /* single quote */
    assertAnalyzesTo(analyzer, "king",
        new String[] { "king", "baron" });

    /* multi words */
    assertAnalyzesTo(analyzer, "king's evil",
        new String[] { "king's", "king's", "evil", "meany" });
  }
}
|
|
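The test above drives the new wordnet parser only through test scaffolding (MockAnalyzer, MockTokenizer). As a rough sketch of how the same classes could be wired up in application code, using only constructors that appear in this commit -- the whitespace analyzer, the "wn_s.pl" file location, the UTF-8 reader and the surrounding exception handling are illustrative assumptions, not part of the patch (imports as in the test above, plus WhitespaceTokenizer, Version and java.io):

    // Sketch only: analyzer choice, file location and charset handling are assumptions.
    Analyzer entryAnalyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // analyze each synonym entry with plain whitespace tokenization,
        // mirroring what the factory later in this commit does when ignoreCase=false
        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_31, reader);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };

    // dedup=true, expand=true, the same defaults the factory in this commit uses
    WordnetSynonymParser parser = new WordnetSynonymParser(true, true, entryAnalyzer);
    parser.add(new InputStreamReader(new FileInputStream("wn_s.pl"), "UTF-8")); // hypothetical path
    final SynonymMap map = parser.build();

    // analyzer that applies the synonyms at index or query time (ignoreCase=false, as in the test)
    Analyzer indexAnalyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_31, reader);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
      }
    };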
@ -90,6 +90,10 @@ import org.apache.lucene.store.OutputStreamDataOutput;
 *
 * <p>"alphabetically" in any of the documentation above indicates UTF-16 code point order,
 * nothing else.
 *
 * <b>NOTE</b>: the FST file format is experimental and
 * subject to change suddenly, requiring you to rebuild the
 * FST suggest index.
 */
public class FSTLookup extends Lookup {

|
|
|
@ -320,6 +320,9 @@ New Features
Optimizations
----------------------

* LUCENE-3233: Improved memory usage, build time, and performance of
  SynonymFilterFactory. (Mike McCandless, Robert Muir)

Bug Fixes
----------------------

|
|
|
@ -0,0 +1,157 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.text.ParseException;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;

/**
 * @deprecated (3.4) use {@link SynonymFilterFactory} instead. this is only a backwards compatibility
 * mechanism that will be removed in Lucene 5.0
 */
// NOTE: rename this to "SynonymFilterFactory" and nuke that delegator in Lucene 5.0!
@Deprecated
final class FSTSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  private SynonymMap map;
  private boolean ignoreCase;

  @Override
  public TokenStream create(TokenStream input) {
    return new SynonymFilter(input, map, ignoreCase);
  }

  @Override
  public void inform(ResourceLoader loader) {
    final boolean ignoreCase = getBoolean("ignoreCase", false);
    this.ignoreCase = ignoreCase;

    String tf = args.get("tokenizerFactory");

    final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args);

    Analyzer analyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
        TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
        return new TokenStreamComponents(tokenizer, stream);
      }
    };

    String format = args.get("format");
    try {
      if (format == null || format.equals("solr")) {
        // TODO: expose dedup as a parameter?
        map = loadSolrSynonyms(loader, true, analyzer);
      } else if (format.equals("wordnet")) {
        map = loadWordnetSynonyms(loader, true, analyzer);
      } else {
        // TODO: somehow make this more pluggable
        throw new RuntimeException("Unrecognized synonyms format: " + format);
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Load synonyms from the solr format, "format=solr".
   */
  private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
    final boolean expand = getBoolean("expand", true);
    String synonyms = args.get("synonyms");
    if (synonyms == null)
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");

    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);

    SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
    File synonymFile = new File(synonyms);
    if (synonymFile.exists()) {
      decoder.reset();
      parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
    } else {
      List<String> files = StrUtils.splitFileNames(synonyms);
      for (String file : files) {
        decoder.reset();
        parser.add(new InputStreamReader(loader.openResource(file), decoder));
      }
    }
    return parser.build();
  }

  /**
   * Load synonyms from the wordnet format, "format=wordnet".
   */
  private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
    final boolean expand = getBoolean("expand", true);
    String synonyms = args.get("synonyms");
    if (synonyms == null)
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");

    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);

    WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer);
    File synonymFile = new File(synonyms);
    if (synonymFile.exists()) {
      decoder.reset();
      parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
    } else {
      List<String> files = StrUtils.splitFileNames(synonyms);
      for (String file : files) {
        decoder.reset();
        parser.add(new InputStreamReader(loader.openResource(file), decoder));
      }
    }
    return parser.build();
  }

  private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
    TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname);
    tokFactory.init(args);
    return tokFactory;
  }
}
|
|
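The factory above is configured entirely through Solr factory arguments. A hedged sketch of driving it programmatically, mirroring the init/inform/create sequence the tests later in this commit use -- the "wn_s.pl" file name, the placeholder token stream and the bare SolrResourceLoader are assumptions for illustration, not part of the patch:

    // Hypothetical wiring; in a real deployment Solr builds the factory from schema.xml.
    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", Version.LUCENE_34.toString()); // >= 3.4 selects the FST-based path
    args.put("synonyms", "wn_s.pl");      // assumed resource name
    args.put("format", "wordnet");        // "solr" is the default; "wordnet" selects the prolog parser
    args.put("ignoreCase", "true");
    args.put("expand", "true");

    SynonymFilterFactory factory = new SynonymFilterFactory();
    factory.init(args);
    factory.inform(new SolrResourceLoader(null, null)); // any loader that can open "wn_s.pl"
    TokenStream syns = factory.create(tokenStream);     // tokenStream is a placeholder for your tokenizer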
@ -0,0 +1,261 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
|
||||
* <p>
|
||||
* The matched tokens from the input stream may be optionally passed through (includeOrig=true)
|
||||
* or discarded. If the original tokens are included, the position increments may be modified
|
||||
* to retain absolute positions after merging with the synonym tokenstream.
|
||||
* <p>
|
||||
* Generated synonyms will start at the same position as the first matched source token.
|
||||
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
final class SlowSynonymFilter extends TokenFilter {
|
||||
|
||||
private final SlowSynonymMap map; // Map<String, SynonymMap>
|
||||
private Iterator<AttributeSource> replacement; // iterator over generated tokens
|
||||
|
||||
public SlowSynonymFilter(TokenStream in, SlowSynonymMap map) {
|
||||
super(in);
|
||||
if (map == null)
|
||||
throw new IllegalArgumentException("map is required");
|
||||
|
||||
this.map = map;
|
||||
// just ensuring these attributes exist...
|
||||
addAttribute(CharTermAttribute.class);
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
addAttribute(OffsetAttribute.class);
|
||||
addAttribute(TypeAttribute.class);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Need to worry about multiple scenarios:
|
||||
* - need to go for the longest match
|
||||
* a b => foo #shouldn't match if "a b" is followed by "c d"
|
||||
* a b c d => bar
|
||||
* - need to backtrack - retry matches for tokens already read
|
||||
* a b c d => foo
|
||||
* b c => bar
|
||||
* If the input stream is "a b c x", one will consume "a b c d"
|
||||
* trying to match the first rule... all but "a" should be
|
||||
* pushed back so a match may be made on "b c".
|
||||
* - don't try and match generated tokens (thus need separate queue)
|
||||
* matching is not recursive.
|
||||
* - handle optional generation of original tokens in all these cases,
|
||||
* merging token streams to preserve token positions.
|
||||
* - preserve original positionIncrement of first matched token
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (true) {
|
||||
// if there are any generated tokens, return them... don't try any
|
||||
// matches against them, as we specifically don't want recursion.
|
||||
if (replacement!=null && replacement.hasNext()) {
|
||||
copy(this, replacement.next());
|
||||
return true;
|
||||
}
|
||||
|
||||
// common case fast-path of first token not matching anything
|
||||
AttributeSource firstTok = nextTok();
|
||||
if (firstTok == null) return false;
|
||||
CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
|
||||
SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
|
||||
if (result == null) {
|
||||
copy(this, firstTok);
|
||||
return true;
|
||||
}
|
||||
|
||||
// fast-path failed, clone ourselves if needed
|
||||
if (firstTok == this)
|
||||
firstTok = cloneAttributes();
|
||||
// OK, we matched a token, so find the longest match.
|
||||
|
||||
matched = new LinkedList<AttributeSource>();
|
||||
|
||||
result = match(result);
|
||||
|
||||
if (result==null) {
|
||||
// no match, simply return the first token read.
|
||||
copy(this, firstTok);
|
||||
return true;
|
||||
}
|
||||
|
||||
// reuse, or create new one each time?
|
||||
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
|
||||
|
||||
//
|
||||
// there was a match... let's generate the new tokens, merging
|
||||
// in the matched tokens (position increments need adjusting)
|
||||
//
|
||||
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
|
||||
boolean includeOrig = result.includeOrig();
|
||||
|
||||
AttributeSource origTok = includeOrig ? firstTok : null;
|
||||
PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
|
||||
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
|
||||
int repPos=0; // curr position in replacement token stream
|
||||
int pos=0; // current position in merged token stream
|
||||
|
||||
for (int i=0; i<result.synonyms.length; i++) {
|
||||
Token repTok = result.synonyms[i];
|
||||
AttributeSource newTok = firstTok.cloneAttributes();
|
||||
CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
|
||||
|
||||
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
|
||||
newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
|
||||
repPos += repTok.getPositionIncrement();
|
||||
if (i==0) repPos=origPos; // make position of first token equal to original
|
||||
|
||||
// if necessary, insert original tokens and adjust position increment
|
||||
while (origTok != null && origPos <= repPos) {
|
||||
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPosInc.setPositionIncrement(origPos-pos);
|
||||
generated.add(origTok);
|
||||
pos += origPosInc.getPositionIncrement();
|
||||
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
||||
if (origTok != null) {
|
||||
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPos += origPosInc.getPositionIncrement();
|
||||
}
|
||||
}
|
||||
|
||||
newPosIncAtt.setPositionIncrement(repPos - pos);
|
||||
generated.add(newTok);
|
||||
pos += newPosIncAtt.getPositionIncrement();
|
||||
}
|
||||
|
||||
// finish up any leftover original tokens
|
||||
while (origTok!=null) {
|
||||
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPosInc.setPositionIncrement(origPos-pos);
|
||||
generated.add(origTok);
|
||||
pos += origPosInc.getPositionIncrement();
|
||||
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
||||
if (origTok != null) {
|
||||
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||
origPos += origPosInc.getPositionIncrement();
|
||||
}
|
||||
}
|
||||
|
||||
// what if we replaced a longer sequence with a shorter one?
|
||||
// a/0 b/5 => foo/0
|
||||
// should I re-create the gap on the next buffered token?
|
||||
|
||||
replacement = generated.iterator();
|
||||
// Now return to the top of the loop to read and return the first
|
||||
// generated token.. The reason this is done is that we may have generated
|
||||
// nothing at all, and may need to continue with more matching logic.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Defer creation of the buffer until the first time it is used to
|
||||
// optimize short fields with no matches.
|
||||
//
|
||||
private LinkedList<AttributeSource> buffer;
|
||||
private LinkedList<AttributeSource> matched;
|
||||
|
||||
private boolean exhausted;
|
||||
|
||||
private AttributeSource nextTok() throws IOException {
|
||||
if (buffer!=null && !buffer.isEmpty()) {
|
||||
return buffer.removeFirst();
|
||||
} else {
|
||||
if (!exhausted && input.incrementToken()) {
|
||||
return this;
|
||||
} else {
|
||||
exhausted = true;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void pushTok(AttributeSource t) {
|
||||
if (buffer==null) buffer=new LinkedList<AttributeSource>();
|
||||
buffer.addFirst(t);
|
||||
}
|
||||
|
||||
private SlowSynonymMap match(SlowSynonymMap map) throws IOException {
|
||||
SlowSynonymMap result = null;
|
||||
|
||||
if (map.submap != null) {
|
||||
AttributeSource tok = nextTok();
|
||||
if (tok != null) {
|
||||
// clone ourselves.
|
||||
if (tok == this)
|
||||
tok = cloneAttributes();
|
||||
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
|
||||
CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
|
||||
SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
|
||||
|
||||
if (subMap != null) {
|
||||
// recurse
|
||||
result = match(subMap);
|
||||
}
|
||||
|
||||
if (result != null) {
|
||||
matched.addFirst(tok);
|
||||
} else {
|
||||
// push back unmatched token
|
||||
pushTok(tok);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if no longer sequence matched, so if this node has synonyms, it's the match.
|
||||
if (result==null && map.synonyms!=null) {
|
||||
result = map;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private void copy(AttributeSource target, AttributeSource source) {
|
||||
if (target != source)
|
||||
source.copyTo(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
input.reset();
|
||||
replacement = null;
|
||||
exhausted = false;
|
||||
}
|
||||
}
|
|
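The comment block in SlowSynonymFilter above describes longest-match and backtracking behavior in the abstract. A minimal sketch of the same scenario with a hand-built map, using only methods visible in this commit (SlowSynonymMap.add, SlowSynonymMap.makeTokens, the SlowSynonymFilter constructor); the rule content is taken from that comment, the tokenizer variable is a placeholder, and the snippet assumes code living in the same package since these classes are package-private:

    // Illustration only: the two rules from the comment above,
    //   "a b c d" => "foo"   and   "b c" => "bar"
    SlowSynonymMap map = new SlowSynonymMap(true); // ignoreCase
    map.add(Arrays.asList("a", "b", "c", "d"),
            SlowSynonymMap.makeTokens(Arrays.asList("foo")),
            false,   // includeOrig: discard the matched tokens
            true);   // mergeExisting
    map.add(Arrays.asList("b", "c"),
            SlowSynonymMap.makeTokens(Arrays.asList("bar")),
            false, true);

    // Per the comment above, on input "a b c x" the filter reads ahead for the longer rule,
    // fails at "x", emits "a" unchanged, pushes the rest back and then matches "b c" => "bar".
    TokenStream syns = new SlowSynonymFilter(tokenizer, map); // tokenizer is a placeholder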
@ -0,0 +1,188 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link SlowSynonymFilter} (only used with luceneMatchVersion < 3.4)
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
|
||||
* expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
final class SlowSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
String synonyms = args.get("synonyms");
|
||||
if (synonyms == null)
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
|
||||
boolean ignoreCase = getBoolean("ignoreCase", false);
|
||||
boolean expand = getBoolean("expand", true);
|
||||
|
||||
String tf = args.get("tokenizerFactory");
|
||||
TokenizerFactory tokFactory = null;
|
||||
if( tf != null ){
|
||||
tokFactory = loadTokenizerFactory( loader, tf, args );
|
||||
}
|
||||
|
||||
Iterable<String> wlist=loadRules( synonyms, loader );
|
||||
|
||||
synMap = new SlowSynonymMap(ignoreCase);
|
||||
parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a list of all rules
|
||||
*/
|
||||
protected Iterable<String> loadRules( String synonyms, ResourceLoader loader ) {
|
||||
List<String> wlist=null;
|
||||
try {
|
||||
File synonymFile = new File(synonyms);
|
||||
if (synonymFile.exists()) {
|
||||
wlist = loader.getLines(synonyms);
|
||||
} else {
|
||||
List<String> files = StrUtils.splitFileNames(synonyms);
|
||||
wlist = new ArrayList<String>();
|
||||
for (String file : files) {
|
||||
List<String> lines = loader.getLines(file.trim());
|
||||
wlist.addAll(lines);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return wlist;
|
||||
}
|
||||
|
||||
private SlowSynonymMap synMap;
|
||||
|
||||
static void parseRules(Iterable<String> rules, SlowSynonymMap map, String mappingSep,
|
||||
String synSep, boolean expansion, TokenizerFactory tokFactory) {
|
||||
int count=0;
|
||||
for (String rule : rules) {
|
||||
// To use regexes, we need an expression that specifies an odd number of chars.
|
||||
// This can't really be done with string.split(), and since we need to
|
||||
// do unescaping at some point anyway, we wouldn't be saving any effort
|
||||
// by using regexes.
|
||||
|
||||
List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);
|
||||
|
||||
List<List<String>> source;
|
||||
List<List<String>> target;
|
||||
|
||||
if (mapping.size() > 2) {
|
||||
throw new RuntimeException("Invalid Synonym Rule:" + rule);
|
||||
} else if (mapping.size()==2) {
|
||||
source = getSynList(mapping.get(0), synSep, tokFactory);
|
||||
target = getSynList(mapping.get(1), synSep, tokFactory);
|
||||
} else {
|
||||
source = getSynList(mapping.get(0), synSep, tokFactory);
|
||||
if (expansion) {
|
||||
// expand to all arguments
|
||||
target = source;
|
||||
} else {
|
||||
// reduce to first argument
|
||||
target = new ArrayList<List<String>>(1);
|
||||
target.add(source.get(0));
|
||||
}
|
||||
}
|
||||
|
||||
boolean includeOrig=false;
|
||||
for (List<String> fromToks : source) {
|
||||
count++;
|
||||
for (List<String> toToks : target) {
|
||||
map.add(fromToks,
|
||||
SlowSynonymMap.makeTokens(toToks),
|
||||
includeOrig,
|
||||
true
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// a , b c , d e f => [[a],[b,c],[d,e,f]]
|
||||
private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
|
||||
List<String> strList = StrUtils.splitSmart(str, separator, false);
|
||||
// now split on whitespace to get a list of token strings
|
||||
List<List<String>> synList = new ArrayList<List<String>>();
|
||||
for (String toks : strList) {
|
||||
List<String> tokList = tokFactory == null ?
|
||||
StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
|
||||
synList.add(tokList);
|
||||
}
|
||||
return synList;
|
||||
}
|
||||
|
||||
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
|
||||
StringReader reader = new StringReader( source );
|
||||
TokenStream ts = loadTokenizer(tokFactory, reader);
|
||||
List<String> tokList = new ArrayList<String>();
|
||||
try {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
while (ts.incrementToken()){
|
||||
if( termAtt.length() > 0 )
|
||||
tokList.add( termAtt.toString() );
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
finally{
|
||||
reader.close();
|
||||
}
|
||||
return tokList;
|
||||
}
|
||||
|
||||
private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
|
||||
TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
|
||||
tokFactory.init( args );
|
||||
return tokFactory;
|
||||
}
|
||||
|
||||
private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
|
||||
return tokFactory.create( reader );
|
||||
}
|
||||
|
||||
public SlowSynonymMap getSynonymMap() {
|
||||
return synMap;
|
||||
}
|
||||
|
||||
public SlowSynonymFilter create(TokenStream input) {
|
||||
return new SlowSynonymFilter(input,synMap);
|
||||
}
|
||||
}
|
|
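SlowSynonymFilterFactory.parseRules above distinguishes two rule shapes: an explicit "=>" mapping and a comma-separated expansion list. A short sketch calling it directly, with the same separators and arguments the tests later in this commit use; the rule strings themselves are illustrative assumptions:

    // Illustrative rules only; parseRules is invoked exactly as in the tests below.
    List<String> rules = new ArrayList<String>();
    rules.add("i-pod, ipod => ipod");   // explicit mapping: every left-hand term rewrites to the right-hand side
    rules.add("wood, forest, woods");   // expansion list: with expand=true each term maps to all of them

    SlowSynonymMap synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);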
@ -0,0 +1,162 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/** Mapping rules for use with {@link SlowSynonymFilter}
|
||||
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
class SlowSynonymMap {
|
||||
/** @lucene.internal */
|
||||
public CharArrayMap<SlowSynonymMap> submap; // recursive: Map<String, SynonymMap>
|
||||
/** @lucene.internal */
|
||||
public Token[] synonyms;
|
||||
int flags;
|
||||
|
||||
static final int INCLUDE_ORIG=0x01;
|
||||
static final int IGNORE_CASE=0x02;
|
||||
|
||||
public SlowSynonymMap() {}
|
||||
public SlowSynonymMap(boolean ignoreCase) {
|
||||
if (ignoreCase) flags |= IGNORE_CASE;
|
||||
}
|
||||
|
||||
public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; }
|
||||
public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; }
|
||||
|
||||
/**
|
||||
* @param singleMatch List<String>, the sequence of strings to match
|
||||
* @param replacement List<Token> the list of tokens to use on a match
|
||||
* @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
|
||||
* @param mergeExisting merge the replacement tokens with any other mappings that exist
|
||||
*/
|
||||
public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
|
||||
SlowSynonymMap currMap = this;
|
||||
for (String str : singleMatch) {
|
||||
if (currMap.submap==null) {
|
||||
// for now hardcode at 4.0, as its what the old code did.
|
||||
// would be nice to fix, but shouldn't store a version in each submap!!!
|
||||
currMap.submap = new CharArrayMap<SlowSynonymMap>(Version.LUCENE_40, 1, ignoreCase());
|
||||
}
|
||||
|
||||
SlowSynonymMap map = currMap.submap.get(str);
|
||||
if (map==null) {
|
||||
map = new SlowSynonymMap();
|
||||
map.flags |= flags & IGNORE_CASE;
|
||||
currMap.submap.put(str, map);
|
||||
}
|
||||
|
||||
currMap = map;
|
||||
}
|
||||
|
||||
if (currMap.synonyms != null && !mergeExisting) {
|
||||
throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
|
||||
}
|
||||
List<Token> superset = currMap.synonyms==null ? replacement :
|
||||
mergeTokens(Arrays.asList(currMap.synonyms), replacement);
|
||||
currMap.synonyms = superset.toArray(new Token[superset.size()]);
|
||||
if (includeOrig) currMap.flags |= INCLUDE_ORIG;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder("<");
|
||||
if (synonyms!=null) {
|
||||
sb.append("[");
|
||||
for (int i=0; i<synonyms.length; i++) {
|
||||
if (i!=0) sb.append(',');
|
||||
sb.append(synonyms[i]);
|
||||
}
|
||||
if ((flags & INCLUDE_ORIG)!=0) {
|
||||
sb.append(",ORIG");
|
||||
}
|
||||
sb.append("],");
|
||||
}
|
||||
sb.append(submap);
|
||||
sb.append(">");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** Produces a List<Token> from a List<String> */
|
||||
public static List<Token> makeTokens(List<String> strings) {
|
||||
List<Token> ret = new ArrayList<Token>(strings.size());
|
||||
for (String str : strings) {
|
||||
//Token newTok = new Token(str,0,0,"SYNONYM");
|
||||
Token newTok = new Token(str, 0,0,"SYNONYM");
|
||||
ret.add(newTok);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
|
||||
* the tokens end up at the same position.
|
||||
*
|
||||
* Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
|
||||
* Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
|
||||
*
|
||||
*/
|
||||
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
|
||||
ArrayList<Token> result = new ArrayList<Token>();
|
||||
if (lst1 ==null || lst2 ==null) {
|
||||
if (lst2 != null) result.addAll(lst2);
|
||||
if (lst1 != null) result.addAll(lst1);
|
||||
return result;
|
||||
}
|
||||
|
||||
int pos=0;
|
||||
Iterator<Token> iter1=lst1.iterator();
|
||||
Iterator<Token> iter2=lst2.iterator();
|
||||
Token tok1 = iter1.hasNext() ? iter1.next() : null;
|
||||
Token tok2 = iter2.hasNext() ? iter2.next() : null;
|
||||
int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0;
|
||||
int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0;
|
||||
while(tok1!=null || tok2!=null) {
|
||||
while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
|
||||
Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
|
||||
tok.copyBuffer(tok1.buffer(), 0, tok1.length());
|
||||
tok.setPositionIncrement(pos1-pos);
|
||||
result.add(tok);
|
||||
pos=pos1;
|
||||
tok1 = iter1.hasNext() ? iter1.next() : null;
|
||||
pos1 += tok1!=null ? tok1.getPositionIncrement() : 0;
|
||||
}
|
||||
while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
|
||||
Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
|
||||
tok.copyBuffer(tok2.buffer(), 0, tok2.length());
|
||||
tok.setPositionIncrement(pos2-pos);
|
||||
result.add(tok);
|
||||
pos=pos2;
|
||||
tok2 = iter2.hasNext() ? iter2.next() : null;
|
||||
pos2 += tok2!=null ? tok2.getPositionIncrement() : 0;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,189 +1,54 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link SynonymFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
|
||||
* expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
|
||||
* format="solr" ignoreCase="false" expand="true"
|
||||
* tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
*/
|
||||
public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
private BaseTokenFilterFactory delegator;
|
||||
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
super.init(args);
|
||||
assureMatchVersion();
|
||||
if (luceneMatchVersion.onOrAfter(Version.LUCENE_34)) {
|
||||
delegator = new FSTSynonymFilterFactory();
|
||||
} else {
|
||||
// check if you use the new optional arg "format". this makes no sense for the old one,
|
||||
// as its wired to solr's synonyms format only.
|
||||
if (args.containsKey("format") && !args.get("format").equals("solr")) {
|
||||
throw new IllegalArgumentException("You must specify luceneMatchVersion >= 3.4 to use alternate synonyms formats");
|
||||
}
|
||||
delegator = new SlowSynonymFilterFactory();
|
||||
}
|
||||
delegator.init(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
assert delegator != null : "init() was not called!";
|
||||
return delegator.create(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void inform(ResourceLoader loader) {
|
||||
String synonyms = args.get("synonyms");
|
||||
if (synonyms == null)
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
|
||||
boolean ignoreCase = getBoolean("ignoreCase", false);
|
||||
boolean expand = getBoolean("expand", true);
|
||||
|
||||
String tf = args.get("tokenizerFactory");
|
||||
TokenizerFactory tokFactory = null;
|
||||
if( tf != null ){
|
||||
tokFactory = loadTokenizerFactory( loader, tf, args );
|
||||
}
|
||||
|
||||
Iterable<String> wlist=loadRules( synonyms, loader );
|
||||
|
||||
synMap = new SynonymMap(ignoreCase);
|
||||
parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a list of all rules
|
||||
*/
|
||||
protected Iterable<String> loadRules( String synonyms, ResourceLoader loader ) {
|
||||
List<String> wlist=null;
|
||||
try {
|
||||
File synonymFile = new File(synonyms);
|
||||
if (synonymFile.exists()) {
|
||||
wlist = loader.getLines(synonyms);
|
||||
} else {
|
||||
List<String> files = StrUtils.splitFileNames(synonyms);
|
||||
wlist = new ArrayList<String>();
|
||||
for (String file : files) {
|
||||
List<String> lines = loader.getLines(file.trim());
|
||||
wlist.addAll(lines);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return wlist;
|
||||
}
|
||||
|
||||
private SynonymMap synMap;
|
||||
|
||||
static void parseRules(Iterable<String> rules, SynonymMap map, String mappingSep,
|
||||
String synSep, boolean expansion, TokenizerFactory tokFactory) {
|
||||
int count=0;
|
||||
for (String rule : rules) {
|
||||
// To use regexes, we need an expression that specifies an odd number of chars.
|
||||
// This can't really be done with string.split(), and since we need to
|
||||
// do unescaping at some point anyway, we wouldn't be saving any effort
|
||||
// by using regexes.
|
||||
|
||||
List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);
|
||||
|
||||
List<List<String>> source;
|
||||
List<List<String>> target;
|
||||
|
||||
if (mapping.size() > 2) {
|
||||
throw new RuntimeException("Invalid Synonym Rule:" + rule);
|
||||
} else if (mapping.size()==2) {
|
||||
source = getSynList(mapping.get(0), synSep, tokFactory);
|
||||
target = getSynList(mapping.get(1), synSep, tokFactory);
|
||||
} else {
|
||||
source = getSynList(mapping.get(0), synSep, tokFactory);
|
||||
if (expansion) {
|
||||
// expand to all arguments
|
||||
target = source;
|
||||
} else {
|
||||
// reduce to first argument
|
||||
target = new ArrayList<List<String>>(1);
|
||||
target.add(source.get(0));
|
||||
}
|
||||
}
|
||||
|
||||
boolean includeOrig=false;
|
||||
for (List<String> fromToks : source) {
|
||||
count++;
|
||||
for (List<String> toToks : target) {
|
||||
map.add(fromToks,
|
||||
SynonymMap.makeTokens(toToks),
|
||||
includeOrig,
|
||||
true
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// a , b c , d e f => [[a],[b,c],[d,e,f]]
|
||||
private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
|
||||
List<String> strList = StrUtils.splitSmart(str, separator, false);
|
||||
// now split on whitespace to get a list of token strings
|
||||
List<List<String>> synList = new ArrayList<List<String>>();
|
||||
for (String toks : strList) {
|
||||
List<String> tokList = tokFactory == null ?
|
||||
StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
|
||||
synList.add(tokList);
|
||||
}
|
||||
return synList;
|
||||
}
|
||||
|
||||
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
|
||||
StringReader reader = new StringReader( source );
|
||||
TokenStream ts = loadTokenizer(tokFactory, reader);
|
||||
List<String> tokList = new ArrayList<String>();
|
||||
try {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
while (ts.incrementToken()){
|
||||
if( termAtt.length() > 0 )
|
||||
tokList.add( termAtt.toString() );
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
finally{
|
||||
reader.close();
|
||||
}
|
||||
return tokList;
|
||||
}
|
||||
|
||||
private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
|
||||
TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
|
||||
tokFactory.init( args );
|
||||
return tokFactory;
|
||||
}
|
||||
|
||||
private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
|
||||
return tokFactory.create( reader );
|
||||
}
|
||||
|
||||
public SynonymMap getSynonymMap() {
|
||||
return synMap;
|
||||
}
|
||||
|
||||
public SynonymFilter create(TokenStream input) {
|
||||
return new SynonymFilter(input,synMap);
|
||||
assert delegator != null : "init() was not called!";
|
||||
((ResourceLoaderAware) delegator).inform(loader);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,30 +17,69 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.junit.Test;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @since solr 1.4
|
||||
*/
|
||||
public class TestMultiWordSynonyms extends BaseTokenTestCase {
|
||||
|
||||
@Test
|
||||
public void testMultiWordSynonyms() throws IOException {
|
||||
/**
|
||||
* @deprecated Remove this test in 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
public void testMultiWordSynonymsOld() throws IOException {
|
||||
List<String> rules = new ArrayList<String>();
|
||||
rules.add("a b c,d");
|
||||
SynonymMap synMap = new SynonymMap(true);
|
||||
SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
|
||||
SlowSynonymMap synMap = new SlowSynonymMap(true);
|
||||
SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
|
||||
|
||||
SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
|
||||
SlowSynonymFilter ts = new SlowSynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
|
||||
// This fails because ["e","e"] is the value of the token stream
|
||||
assertTokenStreamContents(ts, new String[] { "a", "e" });
|
||||
}
|
||||
|
||||
public void testMultiWordSynonyms() throws IOException {
|
||||
SynonymFilterFactory factory = new SynonymFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.putAll(DEFAULT_VERSION_PARAM);
|
||||
args.put("synonyms", "synonyms.txt");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader("a b c,d"));
|
||||
TokenStream ts = factory.create(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false));
|
||||
// This fails because ["e","e"] is the value of the token stream
|
||||
assertTokenStreamContents(ts, new String[] { "a", "e" });
|
||||
}
|
||||
|
||||
private class StringMockSolrResourceLoader implements ResourceLoader {
|
||||
String text;
|
||||
|
||||
StringMockSolrResourceLoader(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
public Object newInstance(String cname, String... subpackages) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public InputStream openResource(String resource) throws IOException {
|
||||
return new ByteArrayInputStream(text.getBytes("UTF-8"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.synonym;
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
@ -29,51 +29,52 @@ import org.apache.lucene.analysis.MockTokenizer;
|
|||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.*;
|
||||
|
||||
/**
|
||||
* @deprecated Remove this test in Lucene 5.0
|
||||
*/
|
||||
public class TestSynonymFilter extends BaseTokenStreamTestCase {
|
||||
@Deprecated
|
||||
public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
static List<String> strings(String str) {
|
||||
String[] arr = str.split(" ");
|
||||
return Arrays.asList(arr);
|
||||
}
|
||||
|
||||
static void assertTokenizesTo(SynonymMap dict, String input,
|
||||
static void assertTokenizesTo(SlowSynonymMap dict, String input,
|
||||
String expected[]) throws IOException {
|
||||
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
||||
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
|
||||
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
|
||||
assertTokenStreamContents(stream, expected);
|
||||
}
|
||||
|
||||
static void assertTokenizesTo(SynonymMap dict, String input,
|
||||
static void assertTokenizesTo(SlowSynonymMap dict, String input,
|
||||
String expected[], int posIncs[]) throws IOException {
|
||||
Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
||||
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
|
||||
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
|
||||
assertTokenStreamContents(stream, expected, posIncs);
|
||||
}
|
||||
|
||||
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
|
||||
static void assertTokenizesTo(SlowSynonymMap dict, List<Token> input,
|
||||
String expected[], int posIncs[])
|
||||
throws IOException {
|
||||
TokenStream tokenizer = new IterTokenStream(input);
|
||||
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
|
||||
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
|
||||
assertTokenStreamContents(stream, expected, posIncs);
|
||||
}
|
||||
|
||||
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
|
||||
static void assertTokenizesTo(SlowSynonymMap dict, List<Token> input,
|
||||
String expected[], int startOffsets[], int endOffsets[], int posIncs[])
|
||||
throws IOException {
|
||||
TokenStream tokenizer = new IterTokenStream(input);
|
||||
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
|
||||
SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
|
||||
assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
|
||||
posIncs);
|
||||
}
|
||||
|
||||
public void testMatching() throws IOException {
|
||||
SynonymMap map = new SynonymMap();
|
||||
SlowSynonymMap map = new SlowSynonymMap();
|
||||
|
||||
boolean orig = false;
|
||||
boolean merge = true;
|
||||
|
@ -110,7 +111,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testIncludeOrig() throws IOException {
|
||||
SynonymMap map = new SynonymMap();
|
||||
SlowSynonymMap map = new SlowSynonymMap();
|
||||
|
||||
boolean orig = true;
|
||||
boolean merge = true;
|
||||
|
@ -167,7 +168,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
|
||||
public void testMapMerge() throws IOException {
|
||||
SynonymMap map = new SynonymMap();
|
||||
SlowSynonymMap map = new SlowSynonymMap();
|
||||
|
||||
boolean orig = false;
|
||||
boolean merge = true;
|
||||
|
@ -206,7 +207,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
|
||||
public void testOverlap() throws IOException {
|
||||
SynonymMap map = new SynonymMap();
|
||||
SlowSynonymMap map = new SlowSynonymMap();
|
||||
|
||||
boolean orig = false;
|
||||
boolean merge = true;
|
||||
|
@ -229,7 +230,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testPositionIncrements() throws IOException {
|
||||
SynonymMap map = new SynonymMap();
|
||||
SlowSynonymMap map = new SlowSynonymMap();
|
||||
|
||||
boolean orig = false;
|
||||
boolean merge = true;
|
||||
|
@ -264,7 +265,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
|
||||
public void testPositionIncrementsWithOrig() throws IOException {
|
||||
SynonymMap map = new SynonymMap();
|
||||
SlowSynonymMap map = new SlowSynonymMap();
|
||||
|
||||
boolean orig = true;
|
||||
boolean merge = true;
|
||||
|
@ -304,7 +305,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
|
|||
// x=>y
|
||||
// analysing "a x" causes "y" to have a bad offset (end less than start)
|
||||
// SOLR-167
|
||||
SynonymMap map = new SynonymMap();
|
||||
SlowSynonymMap map = new SlowSynonymMap();
|
||||
|
||||
boolean orig = false;
|
||||
boolean merge = true;
|
|
@ -0,0 +1,62 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.util.Version;
import org.apache.solr.core.SolrResourceLoader;

public class TestSynonymFilterFactory extends BaseTokenTestCase {
  /** test that we can parse and use the solr syn file */
  public void testSynonyms() throws Exception {
    SynonymFilterFactory factory = new SynonymFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.putAll(DEFAULT_VERSION_PARAM);
    args.put("synonyms", "synonyms.txt");
    factory.init(args);
    factory.inform(new SolrResourceLoader(null, null));
    TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
    assertTrue(ts instanceof SynonymFilter);
    assertTokenStreamContents(ts,
        new String[] { "GB", "gib", "gigabyte", "gigabytes" },
        new int[] { 1, 0, 0, 0 });
  }

  /** test that we can parse and use the solr syn file, with the old impl
   * @deprecated Remove this test in Lucene 5.0 */
  @Deprecated
  public void testSynonymsOld() throws Exception {
    SynonymFilterFactory factory = new SynonymFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", Version.LUCENE_33.toString());
    args.put("synonyms", "synonyms.txt");
    factory.init(args);
    factory.inform(new SolrResourceLoader(null, null));
    TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
    assertTrue(ts instanceof SlowSynonymFilter);
    assertTokenStreamContents(ts,
        new String[] { "GB", "gib", "gigabyte", "gigabytes" },
        new int[] { 1, 0, 0, 0 });
  }
}
|
|
@ -25,32 +25,35 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.solr.common.ResourceLoader;
 
 
 /**
  * @deprecated Remove this test in Lucene 5.0
  */
 @Deprecated
 public class TestSynonymMap extends LuceneTestCase {
 
   public void testInvalidMappingRules() throws Exception {
-    SynonymMap synMap = new SynonymMap( true );
+    SlowSynonymMap synMap = new SlowSynonymMap( true );
     List<String> rules = new ArrayList<String>( 1 );
     rules.add( "a=>b=>c" );
     try{
-      SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+      SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
       fail( "RuntimeException must be thrown." );
     }
     catch( RuntimeException expected ){}
   }
 
   public void testReadMappingRules() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;
 
     // (a)->[b]
     List<String> rules = new ArrayList<String>();
     rules.add( "a=>b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "b" );
 
@@ -58,8 +61,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (b)->[c]
     rules.clear();
     rules.add( "a,b=>c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "c" );
     assertTokIncludes( synMap, "b", "c" );
@@ -67,8 +70,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (a)->[b][c]
     rules.clear();
     rules.add( "a=>b,c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "b" );
     assertTokIncludes( synMap, "a", "c" );
@@ -78,8 +81,8 @@ public class TestSynonymMap extends LuceneTestCase {
     rules.clear();
     rules.add( "a=>a1" );
     rules.add( "a b=>a2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a1" );
     assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -92,8 +95,8 @@ public class TestSynonymMap extends LuceneTestCase {
     rules.add( "a=>a1" );
     rules.add( "a b=>a2" );
     rules.add( "a c=>a3" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a1" );
     assertEquals( 2, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -109,8 +112,8 @@ public class TestSynonymMap extends LuceneTestCase {
     rules.add( "a b=>a2" );
     rules.add( "b=>b1" );
     rules.add( "b c=>b2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a1" );
     assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@@ -121,14 +124,14 @@ public class TestSynonymMap extends LuceneTestCase {
   }
 
   public void testRead1waySynonymRules() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;
 
     // (a)->[a]
     // (b)->[a]
     List<String> rules = new ArrayList<String>();
     rules.add( "a,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "b", "a" );
@@ -138,8 +141,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (c)->[a]
     rules.clear();
     rules.add( "a,b,c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 3, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "b", "a" );
@@ -149,8 +152,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (b1)->(b2)->[a]
     rules.clear();
     rules.add( "a,b1 b2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
@@ -160,8 +163,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (b)->[a1][a2]
     rules.clear();
     rules.add( "a1 a2,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 2, synMap.submap.size() );
     assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
     assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@@ -171,14 +174,14 @@ public class TestSynonymMap extends LuceneTestCase {
   }
 
   public void testRead2waySynonymRules() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;
 
     // (a)->[a][b]
     // (b)->[a][b]
     List<String> rules = new ArrayList<String>();
     rules.add( "a,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b" );
@@ -190,8 +193,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (c)->[a][b][c]
     rules.clear();
     rules.add( "a,b,c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 3, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b" );
@@ -209,8 +212,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // [b1][b2]
     rules.clear();
     rules.add( "a,b1 b2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b1" );
@@ -226,8 +229,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // [b]
     rules.clear();
     rules.add( "a1 a2,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
     assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@@ -239,7 +242,7 @@ public class TestSynonymMap extends LuceneTestCase {
   }
 
   public void testBigramTokenizer() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;
 
     // prepare bi-gram tokenizer factory
     BaseTokenizerFactory tf = new NGramTokenizerFactory();
@@ -251,8 +254,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (ab)->(bc)->(cd)->[ef][fg][gh]
     List<String> rules = new ArrayList<String>();
     rules.add( "abcd=>efgh" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
     assertEquals( 1, synMap.submap.size() );
     assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
     assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
@@ -265,7 +268,7 @@ public class TestSynonymMap extends LuceneTestCase {
   public void testLoadRules() throws Exception {
     Map<String, String> args = new HashMap<String, String>();
     args.put( "synonyms", "something.txt" );
-    SynonymFilterFactory ff = new SynonymFilterFactory();
+    SlowSynonymFilterFactory ff = new SlowSynonymFilterFactory();
     ff.init(args);
     ff.inform( new ResourceLoader() {
       @Override
@@ -289,7 +292,7 @@ public class TestSynonymMap extends LuceneTestCase {
       }
     });
 
-    SynonymMap synMap = ff.getSynonymMap();
+    SlowSynonymMap synMap = ff.getSynonymMap();
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b" );
@@ -298,7 +301,7 @@ public class TestSynonymMap extends LuceneTestCase {
   }
 
 
-  private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception {
+  private void assertTokIncludes( SlowSynonymMap map, String src, String exp ) throws Exception {
     Token[] tokens = map.submap.get( src ).synonyms;
     boolean inc = false;
     for( Token token : tokens ){
@@ -308,7 +311,7 @@ public class TestSynonymMap extends LuceneTestCase {
     assertTrue( inc );
   }
 
-  private SynonymMap getSubSynonymMap( SynonymMap map, String src ){
+  private SlowSynonymMap getSubSynonymMap( SlowSynonymMap map, String src ){
     return map.submap.get( src );
   }
 }