mirror of https://github.com/apache/lucene.git
LUCENE-3233: improve ram/perf of SynonymFilter, add wordnet parsing, nuke contrib/wordnet
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145158 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
19fd2508c6
commit
015ecfa0a0
|
@ -230,7 +230,6 @@
|
||||||
<packageset dir="contrib/misc/src/java"/>
|
<packageset dir="contrib/misc/src/java"/>
|
||||||
<packageset dir="contrib/queries/src/java"/>
|
<packageset dir="contrib/queries/src/java"/>
|
||||||
<packageset dir="contrib/spatial/src/java"/>
|
<packageset dir="contrib/spatial/src/java"/>
|
||||||
<packageset dir="contrib/wordnet/src/java"/>
|
|
||||||
<packageset dir="contrib/xml-query-parser/src/java"/>
|
<packageset dir="contrib/xml-query-parser/src/java"/>
|
||||||
<packageset dir="contrib/queryparser/src/java"/>
|
<packageset dir="contrib/queryparser/src/java"/>
|
||||||
<!-- end alpha sort -->
|
<!-- end alpha sort -->
|
||||||
|
@ -250,7 +249,6 @@
|
||||||
<group title="contrib: Queries" packages="org.apache.lucene.search.similar*:org.apache.lucene.search.regex*:org.apache.regexp*"/>
|
<group title="contrib: Queries" packages="org.apache.lucene.search.similar*:org.apache.lucene.search.regex*:org.apache.regexp*"/>
|
||||||
<group title="contrib: Query Parser" packages="org.apache.lucene.queryParser.*"/>
|
<group title="contrib: Query Parser" packages="org.apache.lucene.queryParser.*"/>
|
||||||
<group title="contrib: Spatial" packages="org.apache.lucene.spatial*"/>
|
<group title="contrib: Spatial" packages="org.apache.lucene.spatial*"/>
|
||||||
<group title="contrib: WordNet" packages="org.apache.lucene.wordnet*"/>
|
|
||||||
<group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>
|
<group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>
|
||||||
|
|
||||||
</sources>
|
</sources>
|
||||||
|
|
|
@ -5,11 +5,6 @@ http://s.apache.org/luceneversions
|
||||||
|
|
||||||
======================= Trunk (not yet released) =======================
|
======================= Trunk (not yet released) =======================
|
||||||
|
|
||||||
Changes in runtime behavior
|
|
||||||
|
|
||||||
* LUCENE-3250: Wordnet's SynExpand requires a non-null Analyzer (it no longer
|
|
||||||
treats null as StandardAnalyzer). (Robert Muir)
|
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-2845: Moved contrib/benchmark to modules.
|
* LUCENE-2845: Moved contrib/benchmark to modules.
|
||||||
|
@ -78,6 +73,10 @@ New Features
|
||||||
documents must be indexed as a document block, using
|
documents must be indexed as a document block, using
|
||||||
IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless)
|
IndexWriter.add/UpdateDocuments (Mark Harwood, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-3233: Added SynonymFilter for applying multi-word synonyms
|
||||||
|
during indexing or querying (with parsers for wordnet and solr formats).
|
||||||
|
Removed contrib/wordnet. (Robert Muir, Mike McCandless)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
|
|
|
@ -1,5 +0,0 @@
|
||||||
As of 2002-11-13 WordNet Lucene contribution contains a single Java class:
|
|
||||||
org.apache.lucene.wordnet.Syns2Index.
|
|
||||||
|
|
||||||
This class creates a Lucene index with synonyms for English words from
|
|
||||||
a Prolog file, which is a part of WordNet database.
|
|
|
@ -1,70 +0,0 @@
|
||||||
<?xml version="1.0"?>
|
|
||||||
|
|
||||||
<!--
|
|
||||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
contributor license agreements. See the NOTICE file distributed with
|
|
||||||
this work for additional information regarding copyright ownership.
|
|
||||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
the "License"); you may not use this file except in compliance with
|
|
||||||
the License. You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
-->
|
|
||||||
|
|
||||||
<project name="wordnet" default="default">
|
|
||||||
|
|
||||||
<description>
|
|
||||||
WordNet
|
|
||||||
</description>
|
|
||||||
|
|
||||||
<property name="prolog.file" location="prologwn/wn_s.pl"/>
|
|
||||||
<property name="synindex.dir" location="index"/>
|
|
||||||
|
|
||||||
<available property="synindex.exists" file="${synindex.dir}" type="dir"/>
|
|
||||||
|
|
||||||
<import file="../contrib-build.xml"/>
|
|
||||||
|
|
||||||
<target name="index" depends="compile" description="Build WordNet index">
|
|
||||||
<fail if="synindex.exists">
|
|
||||||
Index already exists - must remove first.
|
|
||||||
</fail>
|
|
||||||
|
|
||||||
<java classname="org.apache.lucene.wordnet.Syns2Index">
|
|
||||||
<classpath>
|
|
||||||
<path refid="compile.classpath"/>
|
|
||||||
<pathelement location="${build.dir}/classes"/>
|
|
||||||
</classpath>
|
|
||||||
|
|
||||||
<arg file="${prolog.file}"/>
|
|
||||||
<arg file="${synindex.dir}"/>
|
|
||||||
</java>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
|
|
||||||
<target name="synonym" description="Find synonyms for word">
|
|
||||||
<fail unless="synindex.exists">
|
|
||||||
Index does not exist.
|
|
||||||
</fail>
|
|
||||||
|
|
||||||
<fail unless="word">
|
|
||||||
Must specify 'word' property.
|
|
||||||
</fail>
|
|
||||||
|
|
||||||
<java classname="org.apache.lucene.wordnet.SynLookup">
|
|
||||||
<classpath>
|
|
||||||
<path refid="compile.classpath"/>
|
|
||||||
<pathelement location="${build.dir}/classes"/>
|
|
||||||
</classpath>
|
|
||||||
|
|
||||||
<arg file="${synindex.dir}"/>
|
|
||||||
<arg value="${word}"/>
|
|
||||||
</java>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
</project>
|
|
|
@ -1,142 +0,0 @@
|
||||||
package org.apache.lucene.wordnet;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.StringReader;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
|
||||||
import org.apache.lucene.index.Term;
|
|
||||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
|
||||||
import org.apache.lucene.search.BooleanClause;
|
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
|
||||||
import org.apache.lucene.search.Collector;
|
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
|
||||||
import org.apache.lucene.search.Query;
|
|
||||||
import org.apache.lucene.search.Scorer;
|
|
||||||
import org.apache.lucene.search.TermQuery;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Expand a query by looking up synonyms for every term.
|
|
||||||
* You need to invoke {@link Syns2Index} first to build the synonym index.
|
|
||||||
*
|
|
||||||
* @see Syns2Index
|
|
||||||
*/
|
|
||||||
public final class SynExpand {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Perform synonym expansion on a query.
|
|
||||||
*
|
|
||||||
* @param query users query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser.
|
|
||||||
*
|
|
||||||
* @param syns a opened to the Lucene index you previously created with {@link Syns2Index}. The searcher is not closed or otherwise altered.
|
|
||||||
*
|
|
||||||
* @param a analyzer used to parse the users query.
|
|
||||||
*
|
|
||||||
* @param f optional field name to search in or null if you want the default of "contents"
|
|
||||||
*
|
|
||||||
* @param boost optional boost applied to synonyms else no boost is applied
|
|
||||||
*
|
|
||||||
* @return the expanded Query
|
|
||||||
*/
|
|
||||||
public static Query expand( String query,
|
|
||||||
IndexSearcher syns,
|
|
||||||
Analyzer a,
|
|
||||||
String f,
|
|
||||||
final float boost)
|
|
||||||
throws IOException
|
|
||||||
{
|
|
||||||
final Set<String> already = new HashSet<String>(); // avoid dups
|
|
||||||
List<String> top = new LinkedList<String>(); // needs to be separately listed..
|
|
||||||
final String field = ( f == null) ? "contents" : f;
|
|
||||||
|
|
||||||
// [1] Parse query into separate words so that when we expand we can avoid dups
|
|
||||||
TokenStream ts = a.reusableTokenStream( field, new StringReader( query));
|
|
||||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
|
||||||
ts.reset();
|
|
||||||
while (ts.incrementToken()) {
|
|
||||||
String word = termAtt.toString();
|
|
||||||
if ( already.add( word))
|
|
||||||
top.add( word);
|
|
||||||
}
|
|
||||||
ts.end();
|
|
||||||
ts.close();
|
|
||||||
final BooleanQuery tmp = new BooleanQuery();
|
|
||||||
|
|
||||||
// [2] form query
|
|
||||||
Iterator<String> it = top.iterator();
|
|
||||||
while ( it.hasNext())
|
|
||||||
{
|
|
||||||
// [2a] add to level words in
|
|
||||||
String word = it.next();
|
|
||||||
TermQuery tq = new TermQuery( new Term( field, word));
|
|
||||||
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
|
||||||
|
|
||||||
syns.search(new TermQuery( new Term(Syns2Index.F_WORD, word)), new Collector() {
|
|
||||||
IndexReader reader;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean acceptsDocsOutOfOrder() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void collect(int doc) throws IOException {
|
|
||||||
Document d = reader.document(doc);
|
|
||||||
String[] values = d.getValues( Syns2Index.F_SYN);
|
|
||||||
for ( int j = 0; j < values.length; j++)
|
|
||||||
{
|
|
||||||
String syn = values[ j];
|
|
||||||
if ( already.add( syn)) // avoid dups of top level words and synonyms
|
|
||||||
{
|
|
||||||
TermQuery tq = new TermQuery( new Term( field, syn));
|
|
||||||
if ( boost > 0) // else keep normal 1.0
|
|
||||||
tq.setBoost( boost);
|
|
||||||
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setNextReader(AtomicReaderContext context)
|
|
||||||
throws IOException {
|
|
||||||
this.reader = context.reader;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setScorer(Scorer scorer) throws IOException {}
|
|
||||||
});
|
|
||||||
|
|
||||||
// [2b] add in unique synonums
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
return tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,170 +0,0 @@
|
||||||
package org.apache.lucene.wordnet;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.StringReader;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
|
||||||
import org.apache.lucene.index.Term;
|
|
||||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
|
||||||
import org.apache.lucene.search.BooleanClause;
|
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
|
||||||
import org.apache.lucene.search.Collector;
|
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
|
||||||
import org.apache.lucene.search.Query;
|
|
||||||
import org.apache.lucene.search.ScoreDoc;
|
|
||||||
import org.apache.lucene.search.Scorer;
|
|
||||||
import org.apache.lucene.search.TermQuery;
|
|
||||||
import org.apache.lucene.search.TotalHitCountCollector;
|
|
||||||
import org.apache.lucene.store.FSDirectory;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test program to look up synonyms.
|
|
||||||
*/
|
|
||||||
public class SynLookup {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws IOException {
|
|
||||||
if (args.length != 2) {
|
|
||||||
System.out.println(
|
|
||||||
"java org.apache.lucene.wordnet.SynLookup <index path> <word>");
|
|
||||||
}
|
|
||||||
|
|
||||||
FSDirectory directory = FSDirectory.open(new File(args[0]));
|
|
||||||
IndexSearcher searcher = new IndexSearcher(directory, true);
|
|
||||||
|
|
||||||
String word = args[1];
|
|
||||||
Query query = new TermQuery(new Term(Syns2Index.F_WORD, word));
|
|
||||||
TotalHitCountCollector countingCollector = new TotalHitCountCollector();
|
|
||||||
searcher.search(query, countingCollector);
|
|
||||||
|
|
||||||
if (countingCollector.getTotalHits() == 0) {
|
|
||||||
System.out.println("No synonyms found for " + word);
|
|
||||||
} else {
|
|
||||||
System.out.println("Synonyms found for \"" + word + "\":");
|
|
||||||
}
|
|
||||||
|
|
||||||
ScoreDoc[] hits = searcher.search(query, countingCollector.getTotalHits()).scoreDocs;
|
|
||||||
|
|
||||||
for (int i = 0; i < hits.length; i++) {
|
|
||||||
Document doc = searcher.doc(hits[i].doc);
|
|
||||||
|
|
||||||
String[] values = doc.getValues(Syns2Index.F_SYN);
|
|
||||||
|
|
||||||
for (int j = 0; j < values.length; j++) {
|
|
||||||
System.out.println(values[j]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
searcher.close();
|
|
||||||
directory.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Perform synonym expansion on a query.
|
|
||||||
*
|
|
||||||
* @param query
|
|
||||||
* @param syns
|
|
||||||
* @param a
|
|
||||||
* @param field
|
|
||||||
* @param boost
|
|
||||||
*/
|
|
||||||
public static Query expand( String query,
|
|
||||||
IndexSearcher syns,
|
|
||||||
Analyzer a,
|
|
||||||
final String field,
|
|
||||||
final float boost)
|
|
||||||
throws IOException
|
|
||||||
{
|
|
||||||
final Set<String> already = new HashSet<String>(); // avoid dups
|
|
||||||
List<String> top = new LinkedList<String>(); // needs to be separately listed..
|
|
||||||
|
|
||||||
// [1] Parse query into separate words so that when we expand we can avoid dups
|
|
||||||
TokenStream ts = a.reusableTokenStream( field, new StringReader( query));
|
|
||||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
|
||||||
|
|
||||||
while (ts.incrementToken()) {
|
|
||||||
String word = termAtt.toString();
|
|
||||||
if ( already.add( word))
|
|
||||||
top.add( word);
|
|
||||||
}
|
|
||||||
final BooleanQuery tmp = new BooleanQuery();
|
|
||||||
|
|
||||||
// [2] form query
|
|
||||||
Iterator<String> it = top.iterator();
|
|
||||||
while ( it.hasNext())
|
|
||||||
{
|
|
||||||
// [2a] add to level words in
|
|
||||||
String word = it.next();
|
|
||||||
TermQuery tq = new TermQuery( new Term( field, word));
|
|
||||||
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
|
||||||
|
|
||||||
// [2b] add in unique synonums
|
|
||||||
syns.search(new TermQuery( new Term(Syns2Index.F_WORD, word)), new Collector() {
|
|
||||||
IndexReader reader;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean acceptsDocsOutOfOrder() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void collect(int doc) throws IOException {
|
|
||||||
Document d = reader.document(doc);
|
|
||||||
String[] values = d.getValues( Syns2Index.F_SYN);
|
|
||||||
for ( int j = 0; j < values.length; j++)
|
|
||||||
{
|
|
||||||
String syn = values[ j];
|
|
||||||
if ( already.add( syn))
|
|
||||||
{
|
|
||||||
TermQuery tq = new TermQuery( new Term( field, syn));
|
|
||||||
if ( boost > 0) // else keep normal 1.0
|
|
||||||
tq.setBoost( boost);
|
|
||||||
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setNextReader(AtomicReaderContext context)
|
|
||||||
throws IOException {
|
|
||||||
this.reader = context.reader;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setScorer(Scorer scorer) throws IOException {}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
return tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,400 +0,0 @@
|
||||||
package org.apache.lucene.wordnet;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.TreeMap;
|
|
||||||
import java.util.TreeSet;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Loads the <a target="_blank"
|
|
||||||
* href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
|
|
||||||
* href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
|
|
||||||
* into a thread-safe main-memory hash map that can be used for fast
|
|
||||||
* high-frequency lookups of synonyms for any given (lowercase) word string.
|
|
||||||
* <p>
|
|
||||||
* There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
|
|
||||||
* There does not necessarily hold: A -> B, B -> C then A -> C.
|
|
||||||
* <p>
|
|
||||||
* Loading typically takes some 1.5 secs, so should be done only once per
|
|
||||||
* (server) program execution, using a singleton pattern. Once loaded, a
|
|
||||||
* synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).
|
|
||||||
* A loaded default synonym map consumes about 10 MB main memory.
|
|
||||||
* An instance is immutable, hence thread-safe.
|
|
||||||
* <p>
|
|
||||||
* This implementation borrows some ideas from the Lucene Syns2Index demo that
|
|
||||||
* Dave Spencer originally contributed to Lucene. Dave's approach
|
|
||||||
* involved a persistent Lucene index which is suitable for occasional
|
|
||||||
* lookups or very large synonym tables, but considered unsuitable for
|
|
||||||
* high-frequency lookups of medium size synonym tables.
|
|
||||||
* <p>
|
|
||||||
* Example Usage:
|
|
||||||
* <pre class="prettyprint">
|
|
||||||
* String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
|
|
||||||
* SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
|
|
||||||
* for (int i = 0; i < words.length; i++) {
|
|
||||||
* String[] synonyms = map.getSynonyms(words[i]);
|
|
||||||
* System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
|
|
||||||
* }
|
|
||||||
* </pre>
|
|
||||||
* <b/>
|
|
||||||
* Example output:
|
|
||||||
* <pre class="prettyprint">
|
|
||||||
* hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
|
|
||||||
* woods:[forest, wood]
|
|
||||||
* forest:[afforest, timber, timberland, wood, woodland, woods]
|
|
||||||
* wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
|
|
||||||
* xxxx:[]
|
|
||||||
* </pre>
|
|
||||||
*
|
|
||||||
* <p>
|
|
||||||
* <b>See also:</b><br>
|
|
||||||
* <a target="_blank"
|
|
||||||
* href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
|
|
||||||
* man page </a><br>
|
|
||||||
* <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
|
|
||||||
*/
|
|
||||||
public class SynonymMap {
|
|
||||||
|
|
||||||
/** the index data; Map<String word, String[] synonyms> */
|
|
||||||
private final HashMap<String,String[]> table;
|
|
||||||
|
|
||||||
private static final String[] EMPTY = new String[0];
|
|
||||||
|
|
||||||
private static final boolean DEBUG = false;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructs an instance, loading WordNet synonym data from the given input
|
|
||||||
* stream. Finally closes the stream. The words in the stream must be in
|
|
||||||
* UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
|
|
||||||
*
|
|
||||||
* @param input
|
|
||||||
* the stream to read from (null indicates an empty synonym map)
|
|
||||||
* @throws IOException
|
|
||||||
* if an error occured while reading the stream.
|
|
||||||
*/
|
|
||||||
public SynonymMap(InputStream input) throws IOException {
|
|
||||||
this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the synonym set for the given word, sorted ascending.
|
|
||||||
*
|
|
||||||
* @param word
|
|
||||||
* the word to lookup (must be in lowercase).
|
|
||||||
* @return the synonyms; a set of zero or more words, sorted ascending, each
|
|
||||||
* word containing lowercase characters that satisfy
|
|
||||||
* <code>Character.isLetter()</code>.
|
|
||||||
*/
|
|
||||||
public String[] getSynonyms(String word) {
|
|
||||||
String[] synonyms = table.get(word);
|
|
||||||
if (synonyms == null) return EMPTY;
|
|
||||||
String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
|
|
||||||
System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
|
|
||||||
return copy;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns a String representation of the index data for debugging purposes.
|
|
||||||
*
|
|
||||||
* @return a String representation
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
StringBuilder buf = new StringBuilder();
|
|
||||||
Iterator<String> iter = new TreeMap<String,String[]>(table).keySet().iterator();
|
|
||||||
int count = 0;
|
|
||||||
int f0 = 0;
|
|
||||||
int f1 = 0;
|
|
||||||
int f2 = 0;
|
|
||||||
int f3 = 0;
|
|
||||||
|
|
||||||
while (iter.hasNext()) {
|
|
||||||
String word = iter.next();
|
|
||||||
buf.append(word + ":");
|
|
||||||
String[] synonyms = getSynonyms(word);
|
|
||||||
buf.append(Arrays.asList(synonyms));
|
|
||||||
buf.append("\n");
|
|
||||||
count += synonyms.length;
|
|
||||||
if (synonyms.length == 0) f0++;
|
|
||||||
if (synonyms.length == 1) f1++;
|
|
||||||
if (synonyms.length == 2) f2++;
|
|
||||||
if (synonyms.length == 3) f3++;
|
|
||||||
}
|
|
||||||
|
|
||||||
buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
|
|
||||||
return buf.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Analyzes/transforms the given word on input stream loading. This default implementation simply
|
|
||||||
* lowercases the word. Override this method with a custom stemming
|
|
||||||
* algorithm or similar, if desired.
|
|
||||||
*
|
|
||||||
* @param word
|
|
||||||
* the word to analyze
|
|
||||||
* @return the same word, or a different word (or null to indicate that the
|
|
||||||
* word should be ignored)
|
|
||||||
*/
|
|
||||||
protected String analyze(String word) {
|
|
||||||
return word.toLowerCase();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected boolean isValid(String str) {
|
|
||||||
for (int i=str.length(); --i >= 0; ) {
|
|
||||||
if (!Character.isLetter(str.charAt(i))) return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
private HashMap<String,String[]> read(byte[] data) {
|
|
||||||
int WORDS = (int) (76401 / 0.7); // presizing
|
|
||||||
int GROUPS = (int) (88022 / 0.7); // presizing
|
|
||||||
HashMap<String,ArrayList<Integer>> word2Groups = new HashMap<String,ArrayList<Integer>>(WORDS); // Map<String word, int[] groups>
|
|
||||||
HashMap<Integer,ArrayList<String>> group2Words = new HashMap<Integer,ArrayList<String>>(GROUPS); // Map<int group, String[] words>
|
|
||||||
HashMap<String,String> internedWords = new HashMap<String,String>(WORDS);// Map<String word, String word>
|
|
||||||
|
|
||||||
Charset charset = Charset.forName("UTF-8");
|
|
||||||
int lastNum = -1;
|
|
||||||
Integer lastGroup = null;
|
|
||||||
int len = data.length;
|
|
||||||
int i=0;
|
|
||||||
|
|
||||||
while (i < len) { // until EOF
|
|
||||||
/* Part A: Parse a line */
|
|
||||||
|
|
||||||
// scan to beginning of group
|
|
||||||
while (i < len && data[i] != '(') i++;
|
|
||||||
if (i >= len) break; // EOF
|
|
||||||
i++;
|
|
||||||
|
|
||||||
// parse group
|
|
||||||
int num = 0;
|
|
||||||
while (i < len && data[i] != ',') {
|
|
||||||
num = 10*num + (data[i] - 48);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
// if (DEBUG) System.err.println("num="+ num);
|
|
||||||
|
|
||||||
// scan to beginning of word
|
|
||||||
while (i < len && data[i] != '\'') i++;
|
|
||||||
i++;
|
|
||||||
|
|
||||||
// scan to end of word
|
|
||||||
int start = i;
|
|
||||||
do {
|
|
||||||
while (i < len && data[i] != '\'') i++;
|
|
||||||
i++;
|
|
||||||
} while (i < len && data[i] != ','); // word must end with "',"
|
|
||||||
|
|
||||||
if (i >= len) break; // EOF
|
|
||||||
String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
|
|
||||||
// String word = new String(data, 0, start, i-start-1); // ASCII
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Part B: ignore phrases (with spaces and hyphens) and
|
|
||||||
* non-alphabetic words, and let user customize word (e.g. do some
|
|
||||||
* stemming)
|
|
||||||
*/
|
|
||||||
if (!isValid(word)) continue; // ignore
|
|
||||||
word = analyze(word);
|
|
||||||
if (word == null || word.length() == 0) continue; // ignore
|
|
||||||
|
|
||||||
|
|
||||||
/* Part C: Add (group,word) to tables */
|
|
||||||
|
|
||||||
// ensure compact string representation, minimizing memory overhead
|
|
||||||
String w = internedWords.get(word);
|
|
||||||
if (w == null) {
|
|
||||||
word = new String(word); // ensure compact string
|
|
||||||
internedWords.put(word, word);
|
|
||||||
} else {
|
|
||||||
word = w;
|
|
||||||
}
|
|
||||||
|
|
||||||
Integer group = lastGroup;
|
|
||||||
if (num != lastNum) {
|
|
||||||
group = Integer.valueOf(num);
|
|
||||||
lastGroup = group;
|
|
||||||
lastNum = num;
|
|
||||||
}
|
|
||||||
|
|
||||||
// add word --> group
|
|
||||||
ArrayList<Integer> groups = word2Groups.get(word);
|
|
||||||
if (groups == null) {
|
|
||||||
groups = new ArrayList<Integer>(1);
|
|
||||||
word2Groups.put(word, groups);
|
|
||||||
}
|
|
||||||
groups.add(group);
|
|
||||||
|
|
||||||
// add group --> word
|
|
||||||
ArrayList<String> words = group2Words.get(group);
|
|
||||||
if (words == null) {
|
|
||||||
words = new ArrayList<String>(1);
|
|
||||||
group2Words.put(group, words);
|
|
||||||
}
|
|
||||||
words.add(word);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Part D: compute index data structure */
|
|
||||||
HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);
|
|
||||||
|
|
||||||
/* Part E: minimize memory consumption by a factor 3 (or so) */
|
|
||||||
// if (true) return word2Syns;
|
|
||||||
word2Groups = null; // help gc
|
|
||||||
//TODO: word2Groups.clear(); would be more appropriate ?
|
|
||||||
group2Words = null; // help gc
|
|
||||||
//TODO: group2Words.clear(); would be more appropriate ?
|
|
||||||
|
|
||||||
return optimize(word2Syns, internedWords);
|
|
||||||
}
|
|
||||||
|
|
||||||
private HashMap<String,String[]> createIndex(Map<String,ArrayList<Integer>> word2Groups, Map<Integer,ArrayList<String>> group2Words) {
|
|
||||||
HashMap<String,String[]> word2Syns = new HashMap<String,String[]>();
|
|
||||||
|
|
||||||
for (final Map.Entry<String,ArrayList<Integer>> entry : word2Groups.entrySet()) { // for each word
|
|
||||||
ArrayList<Integer> group = entry.getValue();
|
|
||||||
String word = entry.getKey();
|
|
||||||
|
|
||||||
// HashSet synonyms = new HashSet();
|
|
||||||
TreeSet<String> synonyms = new TreeSet<String>();
|
|
||||||
for (int i=group.size(); --i >= 0; ) { // for each groupID of word
|
|
||||||
ArrayList<String> words = group2Words.get(group.get(i));
|
|
||||||
for (int j=words.size(); --j >= 0; ) { // add all words
|
|
||||||
String synonym = words.get(j); // note that w and word are interned
|
|
||||||
if (synonym != word) { // a word is implicitly it's own synonym
|
|
||||||
synonyms.add(synonym);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int size = synonyms.size();
|
|
||||||
if (size > 0) {
|
|
||||||
String[] syns = new String[size];
|
|
||||||
if (size == 1)
|
|
||||||
syns[0] = synonyms.first();
|
|
||||||
else
|
|
||||||
synonyms.toArray(syns);
|
|
||||||
// if (syns.length > 1) Arrays.sort(syns);
|
|
||||||
// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
|
|
||||||
word2Syns.put(word, syns);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return word2Syns;
|
|
||||||
}
|
|
||||||
|
|
||||||
private HashMap<String,String[]> optimize(HashMap<String,String[]> word2Syns, HashMap<String,String> internedWords) {
|
|
||||||
if (DEBUG) {
|
|
||||||
System.err.println("before gc");
|
|
||||||
for (int i=0; i < 10; i++) System.gc();
|
|
||||||
System.err.println("after gc");
|
|
||||||
}
|
|
||||||
|
|
||||||
// collect entries
|
|
||||||
int len = 0;
|
|
||||||
int size = word2Syns.size();
|
|
||||||
String[][] allSynonyms = new String[size][];
|
|
||||||
String[] words = new String[size];
|
|
||||||
Iterator<Map.Entry<String,String[]>> iter = word2Syns.entrySet().iterator();
|
|
||||||
for (int j=0; j < size; j++) {
|
|
||||||
Map.Entry<String,String[]> entry = iter.next();
|
|
||||||
allSynonyms[j] = entry.getValue();
|
|
||||||
words[j] = entry.getKey();
|
|
||||||
len += words[j].length();
|
|
||||||
}
|
|
||||||
|
|
||||||
// assemble large string containing all words
|
|
||||||
StringBuilder buf = new StringBuilder(len);
|
|
||||||
for (int j=0; j < size; j++) buf.append(words[j]);
|
|
||||||
String allWords = new String(buf.toString()); // ensure compact string across JDK versions
|
|
||||||
buf = null;
|
|
||||||
|
|
||||||
// intern words at app level via memory-overlaid substrings
|
|
||||||
for (int p=0, j=0; j < size; j++) {
|
|
||||||
String word = words[j];
|
|
||||||
internedWords.put(word, allWords.substring(p, p + word.length()));
|
|
||||||
p += word.length();
|
|
||||||
}
|
|
||||||
|
|
||||||
// replace words with interned words
|
|
||||||
for (int j=0; j < size; j++) {
|
|
||||||
String[] syns = allSynonyms[j];
|
|
||||||
for (int k=syns.length; --k >= 0; ) {
|
|
||||||
syns[k] = internedWords.get(syns[k]);
|
|
||||||
}
|
|
||||||
word2Syns.remove(words[j]);
|
|
||||||
word2Syns.put(internedWords.get(words[j]), syns);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (DEBUG) {
|
|
||||||
words = null;
|
|
||||||
allSynonyms = null;
|
|
||||||
internedWords = null;
|
|
||||||
allWords = null;
|
|
||||||
System.err.println("before gc");
|
|
||||||
for (int i=0; i < 10; i++) System.gc();
|
|
||||||
System.err.println("after gc");
|
|
||||||
}
|
|
||||||
return word2Syns;
|
|
||||||
}
|
|
||||||
|
|
||||||
// the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
|
|
||||||
private static byte[] toByteArray(InputStream input) throws IOException {
|
|
||||||
try {
|
|
||||||
// safe and fast even if input.available() behaves weird or buggy
|
|
||||||
int len = Math.max(256, input.available());
|
|
||||||
byte[] buffer = new byte[len];
|
|
||||||
byte[] output = new byte[len];
|
|
||||||
|
|
||||||
len = 0;
|
|
||||||
int n;
|
|
||||||
while ((n = input.read(buffer)) >= 0) {
|
|
||||||
if (len + n > output.length) { // grow capacity
|
|
||||||
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
|
|
||||||
System.arraycopy(output, 0, tmp, 0, len);
|
|
||||||
System.arraycopy(buffer, 0, tmp, len, n);
|
|
||||||
buffer = output; // use larger buffer for future larger bulk reads
|
|
||||||
output = tmp;
|
|
||||||
} else {
|
|
||||||
System.arraycopy(buffer, 0, output, len, n);
|
|
||||||
}
|
|
||||||
len += n;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (len == output.length) return output;
|
|
||||||
buffer = null; // help gc
|
|
||||||
buffer = new byte[len];
|
|
||||||
System.arraycopy(output, 0, buffer, 0, len);
|
|
||||||
return buffer;
|
|
||||||
} finally {
|
|
||||||
input.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,148 +0,0 @@
|
||||||
package org.apache.lucene.wordnet;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
|
||||||
import org.apache.lucene.util.AttributeSource;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Injects additional tokens for synonyms of token terms fetched from the
|
|
||||||
* underlying child stream; the child stream must deliver lowercase tokens
|
|
||||||
* for synonyms to be found.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public class SynonymTokenFilter extends TokenFilter {
|
|
||||||
|
|
||||||
/** The Token.type used to indicate a synonym to higher level filters. */
|
|
||||||
public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
|
|
||||||
|
|
||||||
private final SynonymMap synonyms;
|
|
||||||
private final int maxSynonyms;
|
|
||||||
|
|
||||||
private String[] stack = null;
|
|
||||||
private int index = 0;
|
|
||||||
private AttributeSource.State current = null;
|
|
||||||
private int todo = 0;
|
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
|
||||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
|
||||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates an instance for the given underlying stream and synonym table.
|
|
||||||
*
|
|
||||||
* @param input
|
|
||||||
* the underlying child token stream
|
|
||||||
* @param synonyms
|
|
||||||
* the map used to extract synonyms for terms
|
|
||||||
* @param maxSynonyms
|
|
||||||
* the maximum number of synonym tokens to return per underlying
|
|
||||||
* token word (a value of Integer.MAX_VALUE indicates unlimited)
|
|
||||||
*/
|
|
||||||
public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
|
|
||||||
super(input);
|
|
||||||
if (input == null)
|
|
||||||
throw new IllegalArgumentException("input must not be null");
|
|
||||||
if (synonyms == null)
|
|
||||||
throw new IllegalArgumentException("synonyms must not be null");
|
|
||||||
if (maxSynonyms < 0)
|
|
||||||
throw new IllegalArgumentException("maxSynonyms must not be negative");
|
|
||||||
|
|
||||||
this.synonyms = synonyms;
|
|
||||||
this.maxSynonyms = maxSynonyms;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Returns the next token in the stream, or null at EOS. */
|
|
||||||
@Override
|
|
||||||
public final boolean incrementToken() throws IOException {
|
|
||||||
while (todo > 0 && index < stack.length) { // pop from stack
|
|
||||||
if (createToken(stack[index++], current)) {
|
|
||||||
todo--;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!input.incrementToken()) return false; // EOS; iterator exhausted
|
|
||||||
|
|
||||||
stack = synonyms.getSynonyms(termAtt.toString()); // push onto stack
|
|
||||||
if (stack.length > maxSynonyms) randomize(stack);
|
|
||||||
index = 0;
|
|
||||||
current = captureState();
|
|
||||||
todo = maxSynonyms;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates and returns a token for the given synonym of the current input
|
|
||||||
* token; Override for custom (stateless or stateful) behavior, if desired.
|
|
||||||
*
|
|
||||||
* @param synonym
|
|
||||||
* a synonym for the current token's term
|
|
||||||
* @param current
|
|
||||||
* the current token from the underlying child stream
|
|
||||||
* @return a new token, or null to indicate that the given synonym should be
|
|
||||||
* ignored
|
|
||||||
*/
|
|
||||||
protected boolean createToken(String synonym, AttributeSource.State current) {
|
|
||||||
restoreState(current);
|
|
||||||
termAtt.setEmpty().append(synonym);
|
|
||||||
typeAtt.setType(SYNONYM_TOKEN_TYPE);
|
|
||||||
posIncrAtt.setPositionIncrement(0);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Randomize synonyms to later sample a subset. Uses constant random seed
|
|
||||||
* for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
|
|
||||||
* number generator with medium statistical quality (multiplicative
|
|
||||||
* congruential method), producing integers in the range [Integer.MIN_VALUE,
|
|
||||||
* Integer.MAX_VALUE].
|
|
||||||
*/
|
|
||||||
private static void randomize(Object[] arr) {
|
|
||||||
int seed = 1234567; // constant
|
|
||||||
int randomState = 4*seed + 1;
|
|
||||||
// Random random = new Random(seed); // unnecessary overhead
|
|
||||||
int len = arr.length;
|
|
||||||
for (int i=0; i < len-1; i++) {
|
|
||||||
randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
|
|
||||||
int r = randomState % (len-i);
|
|
||||||
if (r < 0) r = -r; // e.g. -9 % 2 == -1
|
|
||||||
// int r = random.nextInt(len-i);
|
|
||||||
|
|
||||||
// swap arr[i, i+r]
|
|
||||||
Object tmp = arr[i];
|
|
||||||
arr[i] = arr[i + r];
|
|
||||||
arr[i + r] = tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void reset() throws IOException {
|
|
||||||
super.reset();
|
|
||||||
stack = null;
|
|
||||||
index = 0;
|
|
||||||
current = null;
|
|
||||||
todo = 0;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,329 +0,0 @@
|
||||||
package org.apache.lucene.wordnet;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.PrintStream;
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.TreeMap;
|
|
||||||
import java.util.TreeSet;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.document.Field;
|
|
||||||
import org.apache.lucene.index.IndexWriter;
|
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
|
||||||
import org.apache.lucene.index.TieredMergePolicy;
|
|
||||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
|
||||||
import org.apache.lucene.store.FSDirectory;
|
|
||||||
import org.apache.lucene.util.Version;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
|
|
||||||
* into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
|
|
||||||
*
|
|
||||||
* This has been tested with WordNet 2.0.
|
|
||||||
*
|
|
||||||
* The index has fields named "word" ({@link #F_WORD})
|
|
||||||
* and "syn" ({@link #F_SYN}).
|
|
||||||
* <p>
|
|
||||||
* The source word (such as 'big') can be looked up in the
|
|
||||||
* "word" field, and if present there will be fields named "syn"
|
|
||||||
* for every synonym. What's tricky here is that there could be <b>multiple</b>
|
|
||||||
* fields with the same name, in the general case for words that have multiple synonyms.
|
|
||||||
* That's not a problem with Lucene, you just use {@link org.apache.lucene.document.Document#getValues}
|
|
||||||
* </p>
|
|
||||||
* <p>
|
|
||||||
* While the WordNet file distinguishes groups of synonyms with
|
|
||||||
* related meanings we don't do that here.
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
|
|
||||||
*
|
|
||||||
* @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
|
|
||||||
* @see <a href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb man page</a>
|
|
||||||
* @see <a href="http://www.hostmon.com/rfc/advanced.jsp">sample site that uses it</a>
|
|
||||||
*/
|
|
||||||
public class Syns2Index
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
private static final PrintStream o = System.out;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
private static final PrintStream err = System.err;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public static final String F_SYN = "syn";
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public static final String F_WORD = "word";
|
|
||||||
|
|
||||||
/**
|
|
||||||
* we don't actually analyze any text (only a NOT_ANALYZED field),
|
|
||||||
* but analyzer can't be null, docinverter wants the offset gap!
|
|
||||||
*/
|
|
||||||
private static final Analyzer ana = new Analyzer() {
|
|
||||||
@Override
|
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Takes arg of prolog file name and index directory.
|
|
||||||
*/
|
|
||||||
public static void main(String[] args)
|
|
||||||
throws Throwable
|
|
||||||
{
|
|
||||||
// get command line arguments
|
|
||||||
String prologFilename = null; // name of file "wn_s.pl"
|
|
||||||
String indexDir = null;
|
|
||||||
if (args.length == 2)
|
|
||||||
{
|
|
||||||
prologFilename = args[0];
|
|
||||||
indexDir = args[1];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
usage();
|
|
||||||
System.exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ensure that the prolog file is readable
|
|
||||||
if (! (new File(prologFilename)).canRead())
|
|
||||||
{
|
|
||||||
err.println("Error: cannot read Prolog file: " + prologFilename);
|
|
||||||
System.exit(1);
|
|
||||||
}
|
|
||||||
// exit if the target index directory already exists
|
|
||||||
if ((new File(indexDir)).isDirectory())
|
|
||||||
{
|
|
||||||
err.println("Error: index directory already exists: " + indexDir);
|
|
||||||
err.println("Please specify a name of a non-existent directory");
|
|
||||||
System.exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
o.println("Opening Prolog file " + prologFilename);
|
|
||||||
final FileInputStream fis = new FileInputStream(prologFilename);
|
|
||||||
final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
|
|
||||||
String line;
|
|
||||||
|
|
||||||
// maps a word to all the "groups" it's in
|
|
||||||
final Map<String,List<String>> word2Nums = new TreeMap<String,List<String>>();
|
|
||||||
// maps a group to all the words in it
|
|
||||||
final Map<String,List<String>> num2Words = new TreeMap<String,List<String>>();
|
|
||||||
// number of rejected words
|
|
||||||
int ndecent = 0;
|
|
||||||
|
|
||||||
// status output
|
|
||||||
int mod = 1;
|
|
||||||
int row = 1;
|
|
||||||
// parse prolog file
|
|
||||||
o.println( "[1/2] Parsing " + prologFilename);
|
|
||||||
while ((line = br.readLine()) != null)
|
|
||||||
{
|
|
||||||
// occasional progress
|
|
||||||
if ((++row) % mod == 0) // periodically print out line we read in
|
|
||||||
{
|
|
||||||
mod *= 2;
|
|
||||||
o.println("\t" + row + " " + line + " " + word2Nums.size()
|
|
||||||
+ " " + num2Words.size() + " ndecent=" + ndecent);
|
|
||||||
}
|
|
||||||
|
|
||||||
// syntax check
|
|
||||||
if (! line.startsWith("s("))
|
|
||||||
{
|
|
||||||
err.println("OUCH: " + line);
|
|
||||||
System.exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// parse line
|
|
||||||
line = line.substring(2);
|
|
||||||
int comma = line.indexOf(',');
|
|
||||||
String num = line.substring(0, comma);
|
|
||||||
int q1 = line.indexOf('\'');
|
|
||||||
line = line.substring(q1 + 1);
|
|
||||||
int q2 = line.lastIndexOf('\'');
|
|
||||||
String word = line.substring(0, q2).toLowerCase().replace("''", "'");
|
|
||||||
|
|
||||||
// make sure is a normal word
|
|
||||||
if (! isDecent(word))
|
|
||||||
{
|
|
||||||
ndecent++;
|
|
||||||
continue; // don't store words w/ spaces
|
|
||||||
}
|
|
||||||
|
|
||||||
// 1/2: word2Nums map
|
|
||||||
// append to entry or add new one
|
|
||||||
List<String> lis = word2Nums.get(word);
|
|
||||||
if (lis == null)
|
|
||||||
{
|
|
||||||
lis = new LinkedList<String>();
|
|
||||||
lis.add(num);
|
|
||||||
word2Nums.put(word, lis);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
lis.add(num);
|
|
||||||
|
|
||||||
// 2/2: num2Words map
|
|
||||||
lis = num2Words.get(num);
|
|
||||||
if (lis == null)
|
|
||||||
{
|
|
||||||
lis = new LinkedList<String>();
|
|
||||||
lis.add(word);
|
|
||||||
num2Words.put(num, lis);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
lis.add(word);
|
|
||||||
}
|
|
||||||
|
|
||||||
// close the streams
|
|
||||||
fis.close();
|
|
||||||
br.close();
|
|
||||||
|
|
||||||
// create the index
|
|
||||||
o.println( "[2/2] Building index to store synonyms, " +
|
|
||||||
" map sizes are " + word2Nums.size() + " and " + num2Words.size());
|
|
||||||
index(indexDir, word2Nums, num2Words);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Checks to see if a word contains only alphabetic characters by
|
|
||||||
* checking it one character at a time.
|
|
||||||
*
|
|
||||||
* @param s string to check
|
|
||||||
* @return <code>true</code> if the string is decent
|
|
||||||
*/
|
|
||||||
private static boolean isDecent(String s)
|
|
||||||
{
|
|
||||||
int len = s.length();
|
|
||||||
for (int i = 0; i < len; i++)
|
|
||||||
{
|
|
||||||
if (!Character.isLetter(s.charAt(i)))
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Forms a Lucene index based on the 2 maps.
|
|
||||||
*
|
|
||||||
* @param indexDir the directory where the index should be created
|
|
||||||
* @param word2Nums
|
|
||||||
* @param num2Words
|
|
||||||
*/
|
|
||||||
private static void index(String indexDir, Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words)
|
|
||||||
throws Throwable
|
|
||||||
{
|
|
||||||
int row = 0;
|
|
||||||
int mod = 1;
|
|
||||||
FSDirectory dir = FSDirectory.open(new File(indexDir));
|
|
||||||
try {
|
|
||||||
|
|
||||||
// override the specific index if it already exists
|
|
||||||
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
|
|
||||||
Version.LUCENE_CURRENT, ana).setOpenMode(OpenMode.CREATE));
|
|
||||||
((TieredMergePolicy) writer.getConfig().getMergePolicy()).setUseCompoundFile(true); // why?
|
|
||||||
Iterator<String> i1 = word2Nums.keySet().iterator();
|
|
||||||
while (i1.hasNext()) // for each word
|
|
||||||
{
|
|
||||||
String g = i1.next();
|
|
||||||
Document doc = new Document();
|
|
||||||
|
|
||||||
int n = index(word2Nums, num2Words, g, doc);
|
|
||||||
if (n > 0)
|
|
||||||
{
|
|
||||||
doc.add( new Field( F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
|
||||||
if ((++row % mod) == 0)
|
|
||||||
{
|
|
||||||
o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc);
|
|
||||||
mod *= 2;
|
|
||||||
}
|
|
||||||
writer.addDocument(doc);
|
|
||||||
} // else degenerate
|
|
||||||
}
|
|
||||||
o.println( "Optimizing..");
|
|
||||||
writer.optimize();
|
|
||||||
writer.close();
|
|
||||||
} finally {
|
|
||||||
dir.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given the 2 maps fills a document for 1 word.
|
|
||||||
*/
|
|
||||||
private static int index(Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words, String g, Document doc)
|
|
||||||
throws Throwable
|
|
||||||
{
|
|
||||||
List<String> keys = word2Nums.get(g); // get list of key#'s
|
|
||||||
Iterator<String> i2 = keys.iterator();
|
|
||||||
|
|
||||||
Set<String> already = new TreeSet<String>(); // keep them sorted
|
|
||||||
|
|
||||||
// pass 1: fill up 'already' with all words
|
|
||||||
while (i2.hasNext()) // for each key#
|
|
||||||
{
|
|
||||||
already.addAll(num2Words.get(i2.next())); // get list of words
|
|
||||||
}
|
|
||||||
int num = 0;
|
|
||||||
already.remove(g); // of course a word is it's own syn
|
|
||||||
Iterator<String> it = already.iterator();
|
|
||||||
while (it.hasNext())
|
|
||||||
{
|
|
||||||
String cur = it.next();
|
|
||||||
// don't store things like 'pit bull' -> 'american pit bull'
|
|
||||||
if (!isDecent(cur))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
num++;
|
|
||||||
doc.add( new Field( F_SYN, cur, Field.Store.YES, Field.Index.NO));
|
|
||||||
}
|
|
||||||
return num;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
private static void usage()
|
|
||||||
{
|
|
||||||
o.println("\n\n" +
|
|
||||||
"java org.apache.lucene.wordnet.Syns2Index <prolog file> <index dir>\n\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@@ -1,57 +0,0 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<html>
-<head>
-<title>WordNet Lucene Synonyms Integration</title>
-</head>
-<body>
-
-This package uses synonyms defined by <a href="http://www.cogsci.princeton.edu/~wn/">WordNet</a>.
-There are two methods: query expansion and analysis.
-
-Both methods first require you to download the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog database</a>
-Inside this archive is a file named wn_s.pl, which contains the WordNet synonyms.
-
-<h1>Query Expansion Method</h1>
-This method creates Lucene index storing the synonyms, which in turn can be used for query expansion.
-
-You normally run {@link org.apache.lucene.wordnet.Syns2Index} once to build the query index/"database", and then call
-{@link org.apache.lucene.wordnet.SynExpand#expand SynExpand.expand(...)} to expand a query.
-
-<p>
-
-<h3> Instructions </h3>
-<ol>
-<li> Invoke Syn2Index as appropriate to build a synonym index.
-     It'll take 2 arguments, the path to wn_s.pl from the WordNet download, and the index name.
-
-<li> Update your UI so that as appropriate you call SynExpand.expand(...) to expand user queries with synonyms.
-</ol>
-
-<h1>Analysis Method</h1>
-This method injects additional synonym tokens for tokens from a child {@link org.apache.lucene.analysis.TokenStream}.
-
-<h3> Instructions </h3>
-<ol>
-<li>Create a {@link org.apache.lucene.wordnet.SynonymMap}, passing in the path to wn_s.pl
-<li>Add a {@link org.apache.lucene.wordnet.SynonymTokenFilter} to your analyzer. Note: SynonymTokenFilter should be after LowerCaseFilter,
-    because it expects terms to already be in lowercase.
-</ol>
-
-</body>
-</html>
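For context, the query-expansion workflow described in the deleted WordNet package documentation above reduces to two calls on the classes this commit removes: Syns2Index.main builds the synonym index once, and SynExpand.expand rewrites a user term at query time. A minimal sketch, mirroring the removed TestWordnet; the paths, field name and analyzer are placeholders, not part of this patch.

    // Sketch only -- uses the contrib/wordnet API that this commit removes.
    // 1) one-time: build the synonym index from the WordNet prolog file
    Syns2Index.main(new String[] { "/path/to/wn_s.pl", "/path/to/syn-index" });

    // 2) query time: expand a user term against that index
    Directory synDir = FSDirectory.open(new File("/path/to/syn-index"));
    IndexSearcher synSearcher = new IndexSearcher(synDir, true);
    Query expanded = SynExpand.expand("woods", synSearcher, analyzer, "contents", 1f);
    // 'analyzer' stands in for any non-null Analyzer of the caller's choosing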
@@ -1,119 +0,0 @@
-package org.apache.lucene.wordnet;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-
-public class TestSynonymTokenFilter extends BaseTokenStreamTestCase {
-  final String testFile = "testSynonyms.txt";
-
-  public void testSynonyms() throws Exception {
-    SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
-    /* all expansions */
-    Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE);
-    assertAnalyzesTo(analyzer, "Lost in the woods",
-        new String[] { "lost", "in", "the", "woods", "forest", "wood" },
-        new int[] { 0, 5, 8, 12, 12, 12 },
-        new int[] { 4, 7, 11, 17, 17, 17 },
-        new int[] { 1, 1, 1, 1, 0, 0 });
-  }
-
-  public void testSynonymsSingleQuote() throws Exception {
-    SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
-    /* all expansions */
-    Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE);
-    assertAnalyzesTo(analyzer, "king",
-        new String[] { "king", "baron" });
-  }
-
-  public void testSynonymsLimitedAmount() throws Exception {
-    SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
-    /* limit to one synonym expansion */
-    Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
-    assertAnalyzesTo(analyzer, "Lost in the woods",
-        /* wood comes before forest due to
-         * the input file, not lexicographic order
-         */
-        new String[] { "lost", "in", "the", "woods", "wood" },
-        new int[] { 0, 5, 8, 12, 12 },
-        new int[] { 4, 7, 11, 17, 17 },
-        new int[] { 1, 1, 1, 1, 0 });
-  }
-
-  public void testReusableTokenStream() throws Exception {
-    SynonymMap map = new SynonymMap(getClass().getResourceAsStream(testFile));
-    /* limit to one synonym expansion */
-    Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
-    assertAnalyzesToReuse(analyzer, "Lost in the woods",
-        new String[] { "lost", "in", "the", "woods", "wood" },
-        new int[] { 0, 5, 8, 12, 12 },
-        new int[] { 4, 7, 11, 17, 17 },
-        new int[] { 1, 1, 1, 1, 0 });
-    assertAnalyzesToReuse(analyzer, "My wolfish dog went to the forest",
-        new String[] { "my", "wolfish", "ravenous", "dog", "went", "to",
-            "the", "forest", "woods" },
-        new int[] { 0, 3, 3, 11, 15, 20, 23, 27, 27 },
-        new int[] { 2, 10, 10, 14, 19, 22, 26, 33, 33 },
-        new int[] { 1, 1, 0, 1, 1, 1, 1, 1, 0 });
-  }
-
-  private class SynonymWhitespaceAnalyzer extends Analyzer {
-    private SynonymMap synonyms;
-    private int maxSynonyms;
-
-    public SynonymWhitespaceAnalyzer(SynonymMap synonyms, int maxSynonyms) {
-      this.synonyms = synonyms;
-      this.maxSynonyms = maxSynonyms;
-    }
-
-    @Override
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      TokenStream ts = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
-      ts = new SynonymTokenFilter(ts, synonyms, maxSynonyms);
-      return ts;
-    }
-
-    private class SavedStreams {
-      Tokenizer source;
-      TokenStream result;
-    }
-
-    @Override
-    public TokenStream reusableTokenStream(String fieldName, Reader reader)
-        throws IOException {
-      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-      if (streams == null) {
-        streams = new SavedStreams();
-        streams.source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
-        streams.result = new SynonymTokenFilter(streams.source, synonyms, maxSynonyms);
-        setPreviousTokenStream(streams);
-      } else {
-        streams.source.reset(reader);
-      }
-      return streams.result;
-    }
-  }
-
-}
@@ -1,94 +0,0 @@
-package org.apache.lucene.wordnet;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util._TestUtil;
-
-public class TestWordnet extends LuceneTestCase {
-  private IndexSearcher searcher;
-  private Directory dir;
-
-  String storePathName = new File(TEMP_DIR,"testLuceneWordnet").getAbsolutePath();
-
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    // create a temporary synonym index
-    File testFile = getDataFile("testSynonyms.txt");
-    String commandLineArgs[] = { testFile.getAbsolutePath(), storePathName };
-    _TestUtil.rmDir(new File(storePathName));
-
-    try {
-      Syns2Index.main(commandLineArgs);
-    } catch (Throwable t) { throw new RuntimeException(t); }
-
-    dir = newFSDirectory(new File(storePathName));
-    searcher = new IndexSearcher(dir, true);
-  }
-
-  public void testExpansion() throws IOException {
-    assertExpandsTo("woods", new String[] { "woods", "forest", "wood" });
-  }
-
-  public void testExpansionSingleQuote() throws IOException {
-    assertExpandsTo("king", new String[] { "king", "baron" });
-  }
-
-  private void assertExpandsTo(String term, String expected[]) throws IOException {
-    Query expandedQuery = SynExpand.expand(term, searcher, new
-        MockAnalyzer(random), "field", 1F);
-    BooleanQuery expectedQuery = new BooleanQuery();
-    for (String t : expected)
-      expectedQuery.add(new TermQuery(new Term("field", t)),
-          BooleanClause.Occur.SHOULD);
-    assertEquals(expectedQuery, expandedQuery);
-  }
-
-  @Override
-  public void tearDown() throws Exception {
-    if (searcher != null) {
-      searcher.close();
-    }
-    if (dir != null) {
-      dir.close();
-    }
-    rmDir(storePathName); // delete our temporary synonym index
-    super.tearDown();
-  }
-
-  private void rmDir(String directory) {
-    File dir = new File(directory);
-    File[] files = dir.listFiles();
-    for (int i = 0; i < files.length; i++) {
-      files[i].delete();
-    }
-    dir.delete();
-  }
-}
@@ -1,9 +0,0 @@
-s(100000001,1,'woods',n,1,0).
-s(100000001,2,'wood',n,1,0).
-s(100000001,3,'forest',n,1,0).
-s(100000002,1,'wolfish',n,1,0).
-s(100000002,2,'ravenous',n,1,0).
-s(100000003,1,'king',n,1,1).
-s(100000003,2,'baron',n,1,1).
-s(100000004,1,'king''sevil',n,1,1).
-s(100000004,2,'meany',n,1,1).
@@ -95,9 +95,6 @@ public class MemoryCodec extends Codec {
       this.out = out;
       this.field = field;
       builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
-
-      // The byte[] output we create can easily be > 255 bytes:
-      builder.setAllowArrayArcs(false);
     }
 
     private class PostingsWriter extends PostingsConsumer {
@@ -0,0 +1,52 @@
+package org.apache.lucene.store;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * @lucene.experimental
+ */
+public class ByteArrayDataOutput extends DataOutput {
+  private byte[] bytes;
+
+  private int pos;
+  private int limit;
+
+  public ByteArrayDataOutput(byte[] bytes) {
+    reset(bytes);
+  }
+
+  public ByteArrayDataOutput(byte[] bytes, int offset, int len) {
+    reset(bytes, offset, len);
+  }
+
+  public ByteArrayDataOutput() {
+    reset(BytesRef.EMPTY_BYTES);
+  }
+
+  public void reset(byte[] bytes) {
+    reset(bytes, 0, bytes.length);
+  }
+
+  public void reset(byte[] bytes, int offset, int len) {
+    this.bytes = bytes;
+    pos = offset;
+    limit = offset + len;
+  }
+
+  public int getPosition() {
+    return pos;
+  }
+
+  @Override
+  public void writeByte(byte b) {
+    assert pos < limit;
+    bytes[pos++] = b;
+  }
+
+  @Override
+  public void writeBytes(byte[] b, int offset, int length) {
+    assert pos + length <= limit;
+    System.arraycopy(b, offset, bytes, pos, length);
+    pos += length;
+  }
+}
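The new ByteArrayDataOutput above is simply a DataOutput over a caller-supplied byte[]; the asserts only guard against overflow when assertions are enabled. A minimal usage sketch (not part of this patch; the buffer size and values are arbitrary):

    byte[] buffer = new byte[16];
    ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
    out.writeByte((byte) 42);
    out.writeBytes(new byte[] { 1, 2, 3 }, 0, 3);
    int written = out.getPosition(); // 4: the valid bytes now sit in buffer[0..3]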
@@ -1,5 +1,7 @@
 package org.apache.lucene.util;
+
+import java.util.Comparator;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with

@@ -167,7 +169,11 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence {
   *          the {@link CharsRef} to copy
   */
  public void copy(CharsRef other) {
-    chars = ArrayUtil.grow(chars, other.length);
+    if (chars == null) {
+      chars = new char[other.length];
+    } else {
+      chars = ArrayUtil.grow(chars, other.length);
+    }
    System.arraycopy(other.chars, other.offset, chars, 0, other.length);
    length = other.length;
    offset = 0;

@@ -213,4 +219,56 @@ public final class CharsRef implements Comparable<CharsRef>, CharSequence {
  public CharSequence subSequence(int start, int end) {
    return new CharsRef(chars, offset + start, offset + end - 1);
  }
+
+  private final static Comparator<CharsRef> utf16SortedAsUTF8SortOrder = new UTF16SortedAsUTF8Comparator();
+
+  public static Comparator<CharsRef> getUTF16SortedAsUTF8Comparator() {
+    return utf16SortedAsUTF8SortOrder;
+  }
+
+  private static class UTF16SortedAsUTF8Comparator implements Comparator<CharsRef> {
+    // Only singleton
+    private UTF16SortedAsUTF8Comparator() {};
+
+    public int compare(CharsRef a, CharsRef b) {
+      if (a == b)
+        return 0;
+
+      final char[] aChars = a.chars;
+      int aUpto = a.offset;
+      final char[] bChars = b.chars;
+      int bUpto = b.offset;
+
+      final int aStop = aUpto + Math.min(a.length, b.length);
+
+      while (aUpto < aStop) {
+        char aChar = aChars[aUpto++];
+        char bChar = bChars[bUpto++];
+        if (aChar != bChar) {
+          // http://icu-project.org/docs/papers/utf16_code_point_order.html
+
+          /* aChar != bChar, fix up each one if they're both in or above the surrogate range, then compare them */
+          if (aChar >= 0xd800 && bChar >= 0xd800) {
+            if (aChar >= 0xe000) {
+              aChar -= 0x800;
+            } else {
+              aChar += 0x2000;
+            }
+
+            if (bChar >= 0xe000) {
+              bChar -= 0x800;
+            } else {
+              bChar += 0x2000;
+            }
+          }
+
+          /* now aChar and bChar are in code point order */
+          return (int)aChar - (int)bChar; /* int must be 32 bits wide */
+        }
+      }
+
+      // One is a prefix of the other, or, they are equal:
+      return a.length - b.length;
+    }
+  }
 }
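The comparator added to CharsRef above folds the surrogate and above-surrogate char ranges so that a plain char-by-char comparison of UTF-16 text yields the same ordering as comparing the UTF-8 bytes. A small sketch of its intended use (not part of this patch; the sample strings are arbitrary), which the new TestCharsRef further down exercises against BytesRef ordering:

    CharsRef[] terms = { new CharsRef("zebra"), new CharsRef("apple") };
    java.util.Arrays.sort(terms, CharsRef.getUTF16SortedAsUTF8Comparator());
    // terms now appear in the same relative order as their UTF-8 (BytesRef) encodings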
@@ -71,7 +71,11 @@ public class FST<T> {
   // Increment version to change it
   private final static String FILE_FORMAT_NAME = "FST";
   private final static int VERSION_START = 0;
-  private final static int VERSION_CURRENT = VERSION_START;
+
+  /** Changed numBytesPerArc for array'd case from byte to int. */
+  private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1;
+
+  private final static int VERSION_CURRENT = VERSION_INT_NUM_BYTES_PER_ARC;
 
   // Never serialized; just used to represent the virtual
   // final node w/ no arcs:

@@ -106,6 +110,8 @@ public class FST<T> {
 
   private boolean allowArrayArcs = true;
 
+  private Arc<T> cachedRootArcs[];
+
   public final static class Arc<T> {
     public int label;
     public T output;

@@ -113,7 +119,7 @@ public class FST<T> {
     int target;
 
     byte flags;
-    T nextFinalOutput;
+    public T nextFinalOutput;
     int nextArc;
 
     // This is non-zero if current arcs are fixed array:

@@ -176,7 +182,7 @@ public class FST<T> {
   public FST(DataInput in, Outputs<T> outputs) throws IOException {
     this.outputs = outputs;
     writer = null;
-    CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_START);
+    CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_INT_NUM_BYTES_PER_ARC, VERSION_INT_NUM_BYTES_PER_ARC);
     if (in.readByte() == 1) {
       // accepts empty string
       int numBytes = in.readVInt();

@@ -209,6 +215,8 @@ public class FST<T> {
     bytes = new byte[in.readVInt()];
     in.readBytes(bytes, 0, bytes.length);
     NO_OUTPUT = outputs.getNoOutput();
+
+    cacheRootArcs();
   }
 
   public INPUT_TYPE getInputType() {

@@ -220,7 +228,7 @@ public class FST<T> {
     return bytes.length;
   }
 
-  void finish(int startNode) {
+  void finish(int startNode) throws IOException {
     if (startNode == FINAL_END_NODE && emptyOutput != null) {
       startNode = 0;
     }

@@ -231,6 +239,32 @@ public class FST<T> {
     System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite);
     bytes = finalBytes;
     this.startNode = startNode;
+
+    cacheRootArcs();
+  }
+
+  // Caches first 128 labels
+  @SuppressWarnings("unchecked")
+  private void cacheRootArcs() throws IOException {
+    cachedRootArcs = (FST.Arc<T>[]) new FST.Arc[0x80];
+    final FST.Arc<T> arc = new FST.Arc<T>();
+    getFirstArc(arc);
+    final BytesReader in = getBytesReader(0);
+    if (targetHasArcs(arc)) {
+      readFirstRealArc(arc.target, arc);
+      while(true) {
+        assert arc.label != END_LABEL;
+        if (arc.label < cachedRootArcs.length) {
+          cachedRootArcs[arc.label] = new Arc<T>().copyFrom(arc);
+        } else {
+          break;
+        }
+        if (arc.isLast()) {
+          break;
+        }
+        readNextRealArc(arc, in);
+      }
+    }
   }
 
   void setEmptyOutput(T v) throws IOException {

@@ -345,8 +379,9 @@ public class FST<T> {
       writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY);
       writer.writeVInt(node.numArcs);
       // placeholder -- we'll come back and write the number
-      // of bytes per arc here:
-      writer.writeByte((byte) 0);
+      // of bytes per arc (int) here:
+      // TODO: we could make this a vInt instead
+      writer.writeInt(0);
       fixedArrayStart = writer.posWrite;
       //System.out.println("  do fixed arcs array arcsStart=" + fixedArrayStart);
     } else {

@@ -421,15 +456,21 @@ public class FST<T> {
       }
     }
 
+    // TODO: if arc'd arrays will be "too wasteful" by some
+    // measure, eg if arcs have vastly different sized
+    // outputs, then we should selectively disable array for
+    // such cases
+
     if (doFixedArray) {
       assert maxBytesPerArc > 0;
       // 2nd pass just "expands" all arcs to take up a fixed
       // byte size
       final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc;
       bytes = ArrayUtil.grow(bytes, sizeNeeded);
-      if (maxBytesPerArc > 255) {
-        throw new IllegalStateException("max arc size is too large (" + maxBytesPerArc + "); disable array arcs by calling Builder.setAllowArrayArcs(false)");
-      }
+      // TODO: we could make this a vInt instead
+      bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24);
+      bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16);
+      bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8);
       bytes[fixedArrayStart-1] = (byte) maxBytesPerArc;
 
       // expand the arcs in place, backwards

@@ -502,7 +543,7 @@ public class FST<T> {
     if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
       // array: jump straight to end
       arc.numArcs = in.readVInt();
-      arc.bytesPerArc = in.readByte() & 0xFF;
+      arc.bytesPerArc = in.readInt();
       //System.out.println("  array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
       arc.posArcsStart = in.pos;
       arc.arcIdx = arc.numArcs - 2;

@@ -528,7 +569,7 @@ public class FST<T> {
       }
       arc.nextArc = in.pos+1;
     }
-    readNextRealArc(arc);
+    readNextRealArc(arc, in);
     assert arc.isLast();
     return arc;
   }

@@ -572,7 +613,7 @@ public class FST<T> {
       //System.out.println("   fixedArray");
       // this is first arc in a fixed-array
       arc.numArcs = in.readVInt();
-      arc.bytesPerArc = in.readByte() & 0xFF;
+      arc.bytesPerArc = in.readInt();
       arc.arcIdx = -1;
       arc.nextArc = arc.posArcsStart = in.pos;
       //System.out.println("  bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);

@@ -580,7 +621,7 @@ public class FST<T> {
       arc.nextArc = address;
       arc.bytesPerArc = 0;
     }
-    return readNextRealArc(arc);
+    return readNextRealArc(arc, in);
   }

@@ -609,7 +650,7 @@ public class FST<T> {
       }
       return readFirstRealArc(arc.nextArc, arc);
     } else {
-      return readNextRealArc(arc);
+      return readNextRealArc(arc, getBytesReader(0));
     }
   }

@@ -627,7 +668,7 @@ public class FST<T> {
         //System.out.println("    nextArc fake array");
         in.pos--;
         in.readVInt();
-        in.readByte();
+        in.readInt();
       }
     } else {
       if (arc.bytesPerArc != 0) {

@@ -645,17 +686,16 @@ public class FST<T> {
     return readLabel(in);
   }
 
-  Arc<T> readNextRealArc(Arc<T> arc) throws IOException {
+  Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
     // this is a continuing arc in a fixed array
-    final BytesReader in;
     if (arc.bytesPerArc != 0) {
       // arcs are at fixed entries
       arc.arcIdx++;
       assert arc.arcIdx < arc.numArcs;
-      in = getBytesReader(arc.posArcsStart - arc.arcIdx*arc.bytesPerArc);
+      in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc;
     } else {
       // arcs are packed
-      in = getBytesReader(arc.nextArc);
+      in.pos = arc.nextArc;
     }
     arc.flags = in.readByte();
     arc.label = readLabel(in);

@@ -701,6 +741,17 @@ public class FST<T> {
   /** Finds an arc leaving the incoming arc, replacing the arc in place.
    *  This returns null if the arc was not found, else the incoming arc. */
   public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc) throws IOException {
+    assert cachedRootArcs != null;
+    // Short-circuit if this arc is in the root arc cache:
+    if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) {
+      final Arc<T> result = cachedRootArcs[labelToMatch];
+      if (result == null) {
+        return result;
+      } else {
+        arc.copyFrom(result);
+        return arc;
+      }
+    }
+
     if (labelToMatch == END_LABEL) {
       if (follow.isFinal()) {

@@ -726,14 +777,18 @@ public class FST<T> {
     // reusable stuff eg BytesReader:
     final BytesReader in = getBytesReader(follow.target);
 
+    // System.out.println("fta label=" + (char) labelToMatch);
+
     if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) {
       // Arcs are full array; do binary search:
       arc.numArcs = in.readVInt();
-      arc.bytesPerArc = in.readByte() & 0xFF;
+      //System.out.println("  bs " + arc.numArcs);
+      arc.bytesPerArc = in.readInt();
       arc.posArcsStart = in.pos;
       int low = 0;
       int high = arc.numArcs-1;
       while (low <= high) {
+        //System.out.println("    cycle");
         int mid = (low + high) >>> 1;
         in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
         int midLabel = readLabel(in);

@@ -744,7 +799,8 @@ public class FST<T> {
         high = mid - 1;
       else {
         arc.arcIdx = mid-1;
-        return readNextRealArc(arc);
+        //System.out.println("    found!");
+        return readNextRealArc(arc, in);
       }
     }

@@ -754,7 +810,12 @@ public class FST<T> {
     // Linear scan
     readFirstTargetArc(follow, arc);
     while(true) {
+      //System.out.println("  non-bs cycle");
+      // TODO: we should fix this code to not have to create
+      // object for the output of every arc we scan... only
+      // for the matching arc, if found
       if (arc.label == labelToMatch) {
+        //System.out.println("    found!");
        return arc;
      } else if (arc.label > labelToMatch) {
        return null;

@@ -863,7 +924,7 @@ public class FST<T> {
   }
 
   // Non-static: reads byte[] from FST
-  class BytesReader extends DataInput {
+  final class BytesReader extends DataInput {
     int pos;
 
     public BytesReader(int pos) {
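The FST changes above add a small root-arc cache: cacheRootArcs() precomputes the arcs for the first 0x80 labels leaving the start node, and findTargetArc() consults that array before falling back to binary search or a linear scan. A generic, self-contained sketch of the pattern (illustration only, not the FST API; the label value and cache contents are made up):

    // toy stand-in: remember the lookup result for the first 128 labels off the root
    int[] rootCache = new int[0x80];            // filled once after construction
    java.util.Arrays.fill(rootCache, -1);       // -1 means "not cached"
    int label = 'a';
    int target = (label < rootCache.length) ? rootCache[label] : -1;
    if (target == -1) {
      // miss (or label >= 0x80): fall back to the normal search path
    }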
@@ -170,7 +170,7 @@ abstract class FSTEnum<T> {
         if (found) {
           // Match
           arc.arcIdx = mid-1;
-          fst.readNextRealArc(arc);
+          fst.readNextRealArc(arc, in);
           assert arc.arcIdx == mid;
           assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
           output[upto] = fst.outputs.add(output[upto-1], arc.output);

@@ -185,7 +185,7 @@ abstract class FSTEnum<T> {
         } else if (low == arc.numArcs) {
           // Dead end
           arc.arcIdx = arc.numArcs-2;
-          fst.readNextRealArc(arc);
+          fst.readNextRealArc(arc, in);
           assert arc.isLast();
           // Dead end (target is after the last arc);
           // rollback to last fork then push

@@ -205,7 +205,7 @@ abstract class FSTEnum<T> {
           }
         } else {
           arc.arcIdx = (low > high ? low : high)-1;
-          fst.readNextRealArc(arc);
+          fst.readNextRealArc(arc, in);
           assert arc.label > targetLabel;
           pushFirst();
           return;

@@ -309,7 +309,7 @@ abstract class FSTEnum<T> {
           // Match -- recurse
           //System.out.println("  match! arcIdx=" + mid);
           arc.arcIdx = mid-1;
-          fst.readNextRealArc(arc);
+          fst.readNextRealArc(arc, in);
           assert arc.arcIdx == mid;
           assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
           output[upto] = fst.outputs.add(output[upto-1], arc.output);

@@ -352,7 +352,7 @@ abstract class FSTEnum<T> {
         // There is a floor arc:
         arc.arcIdx = (low > high ? high : low)-1;
         //System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1));
-        fst.readNextRealArc(arc);
+        fst.readNextRealArc(arc, in);
         assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel;
         assert arc.label < targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel;
         pushLast();
@@ -35,6 +35,7 @@ final class NodeHash<T> {
   }
 
   private boolean nodesEqual(Builder.UnCompiledNode<T> node, int address) throws IOException {
+    final FST<T>.BytesReader in = fst.getBytesReader(0);
     fst.readFirstRealArc(address, scratchArc);
     if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) {
       return false;

@@ -56,7 +57,7 @@ final class NodeHash<T> {
         return false;
       }
       }
-      fst.readNextRealArc(scratchArc);
+      fst.readNextRealArc(scratchArc, in);
     }
 
     return false;

@@ -87,6 +88,7 @@ final class NodeHash<T> {
   // hash code for a frozen node
   private int hash(int node) throws IOException {
     final int PRIME = 31;
+    final FST<T>.BytesReader in = fst.getBytesReader(0);
     //System.out.println("hash frozen");
     int h = 0;
     fst.readFirstRealArc(node, scratchArc);

@@ -102,7 +104,7 @@ final class NodeHash<T> {
       if (scratchArc.isLast()) {
         break;
       }
-      fst.readNextRealArc(scratchArc);
+      fst.readNextRealArc(scratchArc, in);
     }
     //System.out.println("  ret " + (h&Integer.MAX_VALUE));
     return h & Integer.MAX_VALUE;
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
 <a href="api/contrib-spellchecker/index.html">Spellchecker</a>
 </div>
 <div class="menuitem">
-<a href="api/contrib-wordnet/index.html">Wordnet</a>
-</div>
-<div class="menuitem">
 <a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
 </div>
 </div>
@@ -166,9 +166,6 @@ document.write("Last Published: " + document.lastModified);
 <a href="api/contrib-spellchecker/index.html">Spellchecker</a>
 </div>
 <div class="menuitem">
-<a href="api/contrib-wordnet/index.html">Wordnet</a>
-</div>
-<div class="menuitem">
 <a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a>
 </div>
 </div>
@@ -359,12 +356,6 @@ document.write("Last Published: " + document.lastModified);
 </li>
 </ul>
-
-<ul>
-<li>
-<a href="api/contrib-wordnet/index.html">Wordnet</a> ___________________ <em>javadoc-contrib-wordnet</em>
-</li>
-</ul>
 
 <ul>
 <li>
 <a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a> ___________________ <em>javadoc-contrib-xml-query-parser</em>
@@ -168,9 +168,6 @@ document.write("Last Published: " + document.lastModified);
 <a href="../api/contrib-spellchecker/index.html">Spellchecker</a>
 </div>
 <div class="menuitem">
-<a href="../api/contrib-wordnet/index.html">Wordnet</a>
-</div>
-<div class="menuitem">
 <a href="../api/contrib-xml-query-parser/index.html">XML Query Parser</a>
 </div>
 </div>
@@ -263,9 +260,6 @@ document.write("Last Published: " + document.lastModified);
 <a href="#spellchecker">spellchecker</a>
 </li>
 <li>
-<a href="#wordnet">wordnet</a>
-</li>
-<li>
 <a href="#xml-query-parser">xml-query-parser</a>
 </li>
 </ul>
@@ -375,12 +369,7 @@ document.write("Last Published: " + document.lastModified);
 <p>Provides tools for spellchecking and suggestions with Lucene.</p>
 <p>See <a href="../api/contrib-spellchecker/index.html">spellchecker javadoc</a>
 </p>
-<a name="N100DE"></a><a name="wordnet"></a>
-<h3 class="boxed">wordnet</h3>
-<p>Tools to help utilize wordnet synonyms with Lucene</p>
-<p>See <a href="../api/contrib-wordnet/index.html">wordnet javadoc</a>
-</p>
-<a name="N100ED"></a><a name="xml-query-parser"></a>
+<a name="N100DE"></a><a name="xml-query-parser"></a>
 <h3 class="boxed">xml-query-parser</h3>
 <p>A QueryParser that can read queries written in an XML format.</p>
 <p>See <a href="../api/contrib-wordnet/index.html">xml-query-parser javadoc</a>
@@ -106,11 +106,6 @@
 <p>See <a href="../api/contrib-spellchecker/index.html">spellchecker javadoc</a></p>
 </section>
 
-<section id="wordnet"><title>wordnet</title>
-<p>Tools to help utilize wordnet synonyms with Lucene</p>
-<p>See <a href="../api/contrib-wordnet/index.html">wordnet javadoc</a></p>
-</section>
-
 <section id="xml-query-parser"><title>xml-query-parser</title>
 <p>A QueryParser that can read queries written in an XML format.</p>
 <p>See <a href="../api/contrib-wordnet/index.html">xml-query-parser javadoc</a></p>
@@ -66,7 +66,6 @@ See http://forrest.apache.org/docs/linking.html for more info
 <javadoc-contrib-remote label="Remote" href="ext:javadocs-contrib-remote"/>
 <javadoc-contrib-spatial label="Spatial" href="ext:javadocs-contrib-spatial"/>
 <javadoc-contrib-spellchecker label="Spellchecker" href="ext:javadocs-contrib-spellchecker"/>
-<javadoc-contrib-wordnet label="Wordnet" href="ext:javadocs-contrib-wordnet"/>
 <javadoc-contrib-xml-query-parser label="XML Query Parser" href="ext:javadocs-contrib-xml-query-parser"/>
 </javadoc-contrib>
 </javadoc>

@@ -106,7 +105,6 @@ See http://forrest.apache.org/docs/linking.html for more info
 <javadocs-contrib-remote href="api/contrib-remote/index.html"/>
 <javadocs-contrib-spatial href="api/contrib-spatial/index.html"/>
 <javadocs-contrib-spellchecker href="api/contrib-spellchecker/index.html"/>
-<javadocs-contrib-wordnet href="api/contrib-wordnet/index.html"/>
 <javadocs-contrib-xml-query-parser href="api/contrib-xml-query-parser/index.html"/>
 
 <forrest href="http://forrest.apache.org/">
@@ -261,6 +261,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         text = _TestUtil.randomUnicodeString(random, maxWordLength);
       }
 
+      if (VERBOSE) {
+        System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+      }
+
       TokenStream ts = a.reusableTokenStream("dummy", new StringReader(text));
       assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
       CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);

@@ -286,6 +290,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       ts.close();
       // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
       if (!tokens.isEmpty()) {
+        if (VERBOSE) {
+          System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
+        }
         if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
           // offset + pos + type
           assertAnalyzesToReuse(a, text,
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.codecs.CodecProvider;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;

@@ -166,6 +167,13 @@ public class TestIndexWriterCommit extends LuceneTestCase {
    * measure max temp disk space used.
    */
   public void testCommitOnCloseDiskUsage() throws IOException {
+    // MemoryCodec, since it uses FST, is not necessarily
+    // "additive", ie if you add up N small FSTs, then merge
+    // them, the merged result can easily be larger than the
+    // sum because the merged FST may use array encoding for
+    // some arcs (which uses more space):
+    assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory"));
+    assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory"));
     MockDirectoryWrapper dir = newDirectory();
     Analyzer analyzer;
     if (random.nextBoolean()) {
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.codecs.CodecProvider;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;

@@ -142,6 +143,14 @@ public class TestIndexWriterOnDiskFull extends LuceneTestCase {
    */
   public void testAddIndexOnDiskFull() throws IOException
   {
+    // MemoryCodec, since it uses FST, is not necessarily
+    // "additive", ie if you add up N small FSTs, then merge
+    // them, the merged result can easily be larger than the
+    // sum because the merged FST may use array encoding for
+    // some arcs (which uses more space):
+    assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("id").equals("Memory"));
+    assumeFalse("This test cannot run with Memory codec", CodecProvider.getDefault().getFieldCodec("content").equals("Memory"));
+
     int START_COUNT = 57;
     int NUM_DIR = TEST_NIGHTLY ? 50 : 5;
     int END_COUNT = START_COUNT + NUM_DIR* (TEST_NIGHTLY ? 25 : 5);
@@ -0,0 +1,41 @@
+package org.apache.lucene.util;
+
+import java.util.Arrays;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestCharsRef extends LuceneTestCase {
+  public void testUTF16InUTF8Order() {
+    final int numStrings = atLeast(1000);
+    BytesRef utf8[] = new BytesRef[numStrings];
+    CharsRef utf16[] = new CharsRef[numStrings];
+
+    for (int i = 0; i < numStrings; i++) {
+      String s = _TestUtil.randomUnicodeString(random);
+      utf8[i] = new BytesRef(s);
+      utf16[i] = new CharsRef(s);
+    }
+
+    Arrays.sort(utf8);
+    Arrays.sort(utf16, CharsRef.getUTF16SortedAsUTF8Comparator());
+
+    for (int i = 0; i < numStrings; i++) {
+      assertEquals(utf8[i].utf8ToString(), utf16[i].toString());
+    }
+  }
+}
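The next hunk adds SolrSynonymParser, whose javadoc spells out the Solr rule syntax (comments, explicit "=>" mappings, comma-separated equivalents, merged entries). A hedged sketch of feeding it rules, not part of this patch: the constructor and add(Reader) are taken from the new class below; 'analyzer' is a placeholder for whatever Analyzer should tokenize the rule entries, and turning the accumulated SynonymMap.Builder state into a SynonymMap/SynonymFilter is assumed to follow the builder API introduced by this commit and is not shown.

    String rules =
        "# comment lines and blank lines are ignored\n" +
        "i-pod, i pod => ipod\n" +
        "foo => foo bar\n" +
        "foo => baz\n";
    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); // dedup=true, expand=true
    parser.add(new java.io.StringReader(rules));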
@ -0,0 +1,179 @@
package org.apache.lucene.analysis.synonym;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;

/**
 * Parser for the Solr synonyms format.
 * <ol>
 *   <li> Blank lines and lines starting with '#' are comments.
 *   <li> Explicit mappings match any token sequence on the LHS of "=>"
 *        and replace with all alternatives on the RHS.  These types of mappings
 *        ignore the expand parameter in the constructor.
 *        Example:
 *        <blockquote>i-pod, i pod => ipod</blockquote>
 *   <li> Equivalent synonyms may be separated with commas and give
 *        no explicit mapping.  In this case the mapping behavior will
 *        be taken from the expand parameter in the constructor.  This allows
 *        the same synonym file to be used in different synonym handling strategies.
 *        Example:
 *        <blockquote>ipod, i-pod, i pod</blockquote>
 *
 *   <li> Multiple synonym mapping entries are merged.
 *        Example:
 *        <blockquote>
 *         foo => foo bar<br>
 *         foo => baz<br><br>
 *         is equivalent to<br><br>
 *         foo => foo bar, baz
 *        </blockquote>
 * </ol>
 * @lucene.experimental
 */
public class SolrSynonymParser extends SynonymMap.Builder {
  private final boolean expand;
  private final Analyzer analyzer;

  public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup);
    this.expand = expand;
    this.analyzer = analyzer;
  }

  public void add(Reader in) throws IOException, ParseException {
    LineNumberReader br = new LineNumberReader(in);
    try {
      addInternal(br);
    } catch (IllegalArgumentException e) {
      ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
      ex.initCause(e);
      throw ex;
    } finally {
      br.close();
    }
  }

  private void addInternal(BufferedReader in) throws IOException {
    String line = null;
    while ((line = in.readLine()) != null) {
      if (line.length() == 0 || line.charAt(0) == '#') {
        continue; // ignore empty lines and comments
      }

      CharsRef inputs[];
      CharsRef outputs[];

      // TODO: we could process this more efficiently.
      String sides[] = split(line, "=>");
      if (sides.length > 1) { // explicit mapping
        if (sides.length != 2) {
          throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
        }
        String inputStrings[] = split(sides[0], ",");
        inputs = new CharsRef[inputStrings.length];
        for (int i = 0; i < inputs.length; i++) {
          inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
        }

        String outputStrings[] = split(sides[1], ",");
        outputs = new CharsRef[outputStrings.length];
        for (int i = 0; i < outputs.length; i++) {
          outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
        }
      } else {
        String inputStrings[] = split(line, ",");
        inputs = new CharsRef[inputStrings.length];
        for (int i = 0; i < inputs.length; i++) {
          inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
        }
        if (expand) {
          outputs = inputs;
        } else {
          outputs = new CharsRef[1];
          outputs[0] = inputs[0];
        }
      }

      // currently we include the term itself in the map,
      // and use includeOrig = false always.
      // this is how the existing filter does it, but its actually a bug,
      // especially if combined with ignoreCase = true
      for (int i = 0; i < inputs.length; i++) {
        for (int j = 0; j < outputs.length; j++) {
          add(inputs[i], outputs[j], false);
        }
      }
    }
  }

  private static String[] split(String s, String separator) {
    ArrayList<String> list = new ArrayList<String>(2);
    StringBuilder sb = new StringBuilder();
    int pos=0, end=s.length();
    while (pos < end) {
      if (s.startsWith(separator,pos)) {
        if (sb.length() > 0) {
          list.add(sb.toString());
          sb=new StringBuilder();
        }
        pos+=separator.length();
        continue;
      }

      char ch = s.charAt(pos++);
      if (ch=='\\') {
        sb.append(ch);
        if (pos>=end) break;  // ERROR, or let it go?
        ch = s.charAt(pos++);
      }

      sb.append(ch);
    }

    if (sb.length() > 0) {
      list.add(sb.toString());
    }

    return list.toArray(new String[list.size()]);
  }

  private String unescape(String s) {
    if (s.indexOf("\\") >= 0) {
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < s.length(); i++) {
        char ch = s.charAt(i);
        if (ch == '\\' && i < s.length() - 1) {
          sb.append(s.charAt(++i));
        } else {
          sb.append(ch);
        }
      }
      return sb.toString();
    }
    return s;
  }
}
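A rough usage sketch for the parser above, not part of this patch: feed it a Reader over Solr-format rules, build the SynonymMap, then hand that map to SynonymFilter at analysis time. The WhitespaceAnalyzer import path and the Version constant below are assumptions for illustration only; any Analyzer whose tokenization matches the synonym entries will do.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer; // assumption: any whitespace-splitting analyzer
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.Version;

public class SolrSynonymParserSketch {
  public static void main(String[] args) throws Exception {
    String rules =
        "# comments and blank lines are ignored\n" +
        "i-pod, i pod => ipod\n" +   // explicit mapping: the expand flag is ignored
        "ipod, i-pod, i pod\n";      // equivalent synonyms: behavior comes from expand

    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40); // assumed constructor
    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); // dedup=true, expand=true
    parser.add(new StringReader(rules));
    SynonymMap map = parser.build();

    // Wrap any TokenStream with the filter; ignoreCase=true means the caller is
    // responsible for lowercasing the map entries, per the filter's javadoc.
    TokenStream ts = analyzer.reusableTokenStream("field", new StringReader("i pod nano"));
    ts = new SynonymFilter(ts, map, true);
  }
}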
@ -1,3 +1,5 @@
package org.apache.lucene.analysis.synonym;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
@ -15,245 +17,550 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.analysis.synonym;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.AttributeSource;
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
import java.io.IOException;
|
/**
|
||||||
import java.util.ArrayList;
|
* Matches single or multi word synonyms in a token stream.
|
||||||
import java.util.Iterator;
|
* This token stream cannot properly handle position
|
||||||
import java.util.LinkedList;
|
* increments != 1, ie, you should place this filter before
|
||||||
|
* filtering out stop words.
|
||||||
/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
|
*
|
||||||
* <p>
|
* <p>Note that with the current implementation, parsing is
|
||||||
* The matched tokens from the input stream may be optionally passed through (includeOrig=true)
|
* greedy, so whenever multiple parses would apply, the rule
|
||||||
* or discarded. If the original tokens are included, the position increments may be modified
|
* starting the earliest and parsing the most tokens wins.
|
||||||
* to retain absolute positions after merging with the synonym tokenstream.
|
* For example if you have these rules:
|
||||||
* <p>
|
*
|
||||||
* Generated synonyms will start at the same position as the first matched source token.
|
* <pre>
|
||||||
|
* a -> x
|
||||||
|
* a b -> y
|
||||||
|
* b c d -> z
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* Then input <code>a b c d e</code> parses to <code>y b c
|
||||||
|
* d</code>, ie the 2nd rule "wins" because it started
|
||||||
|
* earliest and matched the most input tokens of other rules
|
||||||
|
* starting at that point.</p>
|
||||||
|
*
|
||||||
|
* <p>A future improvement to this filter could allow
|
||||||
|
* non-greedy parsing, such that the 3rd rule would win, and
|
||||||
|
* also separately allow multiple parses, such that all 3
|
||||||
|
* rules would match, perhaps even on a rule by rule
|
||||||
|
* basis.</p>
|
||||||
|
*
|
||||||
|
* <p><b>NOTE</b>: when a match occurs, the output tokens
|
||||||
|
* associated with the matching rule are "stacked" on top of
|
||||||
|
* the input stream (if the rule had
|
||||||
|
* <code>keepOrig=true</code>) and also on top of aother
|
||||||
|
* matched rule's output tokens. This is not a correct
|
||||||
|
* solution, as really the output should be an abitrary
|
||||||
|
* graph/lattice. For example, with the above match, you
|
||||||
|
* would expect an exact <code>PhraseQuery</code> <code>"y b
|
||||||
|
* c"</code> to match the parsed tokens, but it will fail to
|
||||||
|
* do so. This limitations is necessary because Lucene's
|
||||||
|
* TokenStream (and index) cannot yet represent an arbitrary
|
||||||
|
* graph.</p>
|
||||||
|
*
|
||||||
|
* <p><b>NOTE</b>: If multiple incoming tokens arrive on the
|
||||||
|
* same position, only the first token at that position is
|
||||||
|
* used for parsing. Subsequent tokens simply pass through
|
||||||
|
* and are not parsed. A future improvement would be to
|
||||||
|
* allow these tokens to also be matched.</p>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// TODO: maybe we should resolve token -> wordID then run
|
||||||
|
// FST on wordIDs, for better perf?
|
||||||
|
|
||||||
|
// TODO: a more efficient approach would be Aho/Corasick's
|
||||||
|
// algorithm
|
||||||
|
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
|
||||||
|
// It improves over the current approach here
|
||||||
|
// because it does not fully re-start matching at every
|
||||||
|
// token. For exampl,e if one pattern is "a b c x"
|
||||||
|
// and another is "b c d" and the input is "a b c d", on
|
||||||
|
// trying to parse "a b c x" but failing when you got to x,
|
||||||
|
// rather than starting over again your really should
|
||||||
|
// immediately recognize that "b c d" matches at the next
|
||||||
|
// input. I suspect this won't matter that much in
|
||||||
|
// practice, but it's possible on some set of synonyms it
|
||||||
|
// will. We'd have to modify Aho/Corasick to enforce our
|
||||||
|
// conflict resolving (eg greedy matching) because that algo
|
||||||
|
// finds all matches.
|
||||||
|
|
||||||
public final class SynonymFilter extends TokenFilter {
|
public final class SynonymFilter extends TokenFilter {
|
||||||
|
|
||||||
private final SynonymMap map; // Map<String, SynonymMap>
|
public static final String TYPE_SYNONYM = "SYNONYM";
|
||||||
private Iterator<AttributeSource> replacement; // iterator over generated tokens
|
|
||||||
|
|
||||||
public SynonymFilter(TokenStream in, SynonymMap map) {
|
private final SynonymMap synonyms;
|
||||||
super(in);
|
|
||||||
if (map == null)
|
|
||||||
throw new IllegalArgumentException("map is required");
|
|
||||||
|
|
||||||
this.map = map;
|
private final boolean ignoreCase;
|
||||||
// just ensuring these attributes exist...
|
private final int rollBufferSize;
|
||||||
addAttribute(CharTermAttribute.class);
|
|
||||||
addAttribute(PositionIncrementAttribute.class);
|
private int captureCount;
|
||||||
addAttribute(OffsetAttribute.class);
|
|
||||||
addAttribute(TypeAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||||
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
|
// How many future input tokens have already been matched
|
||||||
|
// to a synonym; because the matching is "greedy" we don't
|
||||||
|
// try to do any more matching for such tokens:
|
||||||
|
private int inputSkipCount;
|
||||||
|
|
||||||
|
// Hold all buffered (read ahead) stacked input tokens for
|
||||||
|
// a future position. When multiple tokens are at the
|
||||||
|
// same position, we only store (and match against) the
|
||||||
|
// term for the first token at the position, but capture
|
||||||
|
// state for (and enumerate) all other tokens at this
|
||||||
|
// position:
|
||||||
|
private static class PendingInput {
|
||||||
|
final CharsRef term = new CharsRef();
|
||||||
|
AttributeSource.State state;
|
||||||
|
boolean keepOrig;
|
||||||
|
boolean consumed = true;
|
||||||
|
int startOffset;
|
||||||
|
int endOffset;
|
||||||
|
|
||||||
|
public void reset() {
|
||||||
|
state = null;
|
||||||
|
consumed = true;
|
||||||
|
keepOrig = false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Rolling buffer, holding pending input tokens we had to
|
||||||
|
// clone because we needed to look ahead, indexed by
|
||||||
|
// position:
|
||||||
|
private final PendingInput[] futureInputs;
|
||||||
|
|
||||||
|
// Holds pending output synonyms for one future position:
|
||||||
|
private static class PendingOutputs {
|
||||||
|
CharsRef[] outputs;
|
||||||
|
int upto;
|
||||||
|
int count;
|
||||||
|
int posIncr = 1;
|
||||||
|
|
||||||
|
public PendingOutputs() {
|
||||||
|
outputs = new CharsRef[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset() {
|
||||||
|
upto = count = 0;
|
||||||
|
posIncr = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CharsRef pullNext() {
|
||||||
|
assert upto < count;
|
||||||
|
final CharsRef result = outputs[upto++];
|
||||||
|
posIncr = 0;
|
||||||
|
if (upto == count) {
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(char[] output, int offset, int len) {
|
||||||
|
if (count == outputs.length) {
|
||||||
|
final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||||
|
System.arraycopy(outputs, 0, next, 0, count);
|
||||||
|
outputs = next;
|
||||||
|
}
|
||||||
|
if (outputs[count] == null) {
|
||||||
|
outputs[count] = new CharsRef();
|
||||||
|
}
|
||||||
|
outputs[count].copy(output, offset, len);
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
|
||||||
|
|
||||||
|
// Rolling buffer, holding stack of pending synonym
|
||||||
|
// outputs, indexed by position:
|
||||||
|
private final PendingOutputs[] futureOutputs;
|
||||||
|
|
||||||
|
// Where (in rolling buffers) to write next input saved state:
|
||||||
|
private int nextWrite;
|
||||||
|
|
||||||
|
// Where (in rolling buffers) to read next input saved state:
|
||||||
|
private int nextRead;
|
||||||
|
|
||||||
|
// True once we've read last token
|
||||||
|
private boolean finished;
|
||||||
|
|
||||||
|
private final FST.Arc<BytesRef> scratchArc;
|
||||||
|
|
||||||
|
private final FST<BytesRef> fst;
|
||||||
|
|
||||||
|
private final BytesRef scratchBytes = new BytesRef();
|
||||||
|
private final CharsRef scratchChars = new CharsRef();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param input input tokenstream
|
||||||
|
* @param synonyms synonym map
|
||||||
|
* @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
|
||||||
|
* Note, if you set this to true, its your responsibility to lowercase
|
||||||
|
* the input entries when you create the {@link SynonymMap}
|
||||||
|
*/
|
||||||
|
public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
|
||||||
|
super(input);
|
||||||
|
this.synonyms = synonyms;
|
||||||
|
this.ignoreCase = ignoreCase;
|
||||||
|
this.fst = synonyms.fst;
|
||||||
|
|
||||||
|
if (fst == null) {
|
||||||
|
throw new IllegalArgumentException("fst must be non-null");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Must be 1+ so that when roll buffer is at full
|
||||||
|
// lookahead we can distinguish this full buffer from
|
||||||
|
// the empty buffer:
|
||||||
|
rollBufferSize = 1+synonyms.maxHorizontalContext;
|
||||||
|
|
||||||
|
futureInputs = new PendingInput[rollBufferSize];
|
||||||
|
futureOutputs = new PendingOutputs[rollBufferSize];
|
||||||
|
for(int pos=0;pos<rollBufferSize;pos++) {
|
||||||
|
futureInputs[pos] = new PendingInput();
|
||||||
|
futureOutputs[pos] = new PendingOutputs();
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);
|
||||||
|
|
||||||
|
scratchArc = new FST.Arc<BytesRef>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void capture() {
|
||||||
|
captureCount++;
|
||||||
|
//System.out.println(" capture slot=" + nextWrite);
|
||||||
|
final PendingInput input = futureInputs[nextWrite];
|
||||||
|
|
||||||
|
input.state = captureState();
|
||||||
|
input.consumed = false;
|
||||||
|
input.term.copy(termAtt.buffer(), 0, termAtt.length());
|
||||||
|
|
||||||
|
nextWrite = rollIncr(nextWrite);
|
||||||
|
|
||||||
|
// Buffer head should never catch up to tail:
|
||||||
|
assert nextWrite != nextRead;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Need to worry about multiple scenarios:
|
This is the core of this TokenFilter: it locates the
|
||||||
* - need to go for the longest match
|
synonym matches and buffers up the results into
|
||||||
* a b => foo #shouldn't match if "a b" is followed by "c d"
|
futureInputs/Outputs.
|
||||||
* a b c d => bar
|
|
||||||
* - need to backtrack - retry matches for tokens already read
|
|
||||||
* a b c d => foo
|
|
||||||
* b c => bar
|
|
||||||
* If the input stream is "a b c x", one will consume "a b c d"
|
|
||||||
* trying to match the first rule... all but "a" should be
|
|
||||||
* pushed back so a match may be made on "b c".
|
|
||||||
* - don't try and match generated tokens (thus need separate queue)
|
|
||||||
* matching is not recursive.
|
|
||||||
* - handle optional generation of original tokens in all these cases,
|
|
||||||
* merging token streams to preserve token positions.
|
|
||||||
* - preserve original positionIncrement of first matched token
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean incrementToken() throws IOException {
|
|
||||||
while (true) {
|
|
||||||
// if there are any generated tokens, return them... don't try any
|
|
||||||
// matches against them, as we specifically don't want recursion.
|
|
||||||
if (replacement!=null && replacement.hasNext()) {
|
|
||||||
copy(this, replacement.next());
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// common case fast-path of first token not matching anything
|
NOTE: this calls input.incrementToken and does not
|
||||||
AttributeSource firstTok = nextTok();
|
capture the state if no further tokens were checked. So
|
||||||
if (firstTok == null) return false;
|
caller must then forward state to our caller, or capture:
|
||||||
CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
|
*/
|
||||||
SynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
|
|
||||||
if (result == null) {
|
|
||||||
copy(this, firstTok);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// fast-path failed, clone ourselves if needed
|
private void parse() throws IOException {
|
||||||
if (firstTok == this)
|
//System.out.println("\nS: parse");
|
||||||
firstTok = cloneAttributes();
|
|
||||||
// OK, we matched a token, so find the longest match.
|
|
||||||
|
|
||||||
matched = new LinkedList<AttributeSource>();
|
assert inputSkipCount == 0;
|
||||||
|
|
||||||
result = match(result);
|
int curNextRead = nextRead;
|
||||||
|
|
||||||
if (result==null) {
|
// Holds the longest match we've seen so far:
|
||||||
// no match, simply return the first token read.
|
BytesRef matchOutput = null;
|
||||||
copy(this, firstTok);
|
int matchInputLength = 0;
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// reuse, or create new one each time?
|
BytesRef pendingOutput = fst.outputs.getNoOutput();
|
||||||
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
|
fst.getFirstArc(scratchArc);
|
||||||
|
|
||||||
//
|
assert scratchArc.output == fst.outputs.getNoOutput();
|
||||||
// there was a match... let's generate the new tokens, merging
|
|
||||||
// in the matched tokens (position increments need adjusting)
|
|
||||||
//
|
|
||||||
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
|
|
||||||
boolean includeOrig = result.includeOrig();
|
|
||||||
|
|
||||||
AttributeSource origTok = includeOrig ? firstTok : null;
|
int tokenCount = 0;
|
||||||
PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
|
|
||||||
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
|
|
||||||
int repPos=0; // curr position in replacement token stream
|
|
||||||
int pos=0; // current position in merged token stream
|
|
||||||
|
|
||||||
for (int i=0; i<result.synonyms.length; i++) {
|
byToken:
|
||||||
Token repTok = result.synonyms[i];
|
while(true) {
|
||||||
AttributeSource newTok = firstTok.cloneAttributes();
|
|
||||||
CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
|
|
||||||
OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
|
|
||||||
PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
|
|
||||||
|
|
||||||
OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
|
// Pull next token's chars:
|
||||||
|
final char[] buffer;
|
||||||
|
final int bufferLen;
|
||||||
|
//System.out.println(" cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
|
||||||
|
|
||||||
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
|
if (curNextRead == nextWrite) {
|
||||||
newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
|
|
||||||
repPos += repTok.getPositionIncrement();
|
|
||||||
if (i==0) repPos=origPos; // make position of first token equal to original
|
|
||||||
|
|
||||||
// if necessary, insert original tokens and adjust position increment
|
// We used up our lookahead buffer of input tokens
|
||||||
while (origTok != null && origPos <= repPos) {
|
// -- pull next real input token:
|
||||||
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
|
||||||
origPosInc.setPositionIncrement(origPos-pos);
|
if (finished) {
|
||||||
generated.add(origTok);
|
break;
|
||||||
pos += origPosInc.getPositionIncrement();
|
} else {
|
||||||
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
//System.out.println(" input.incrToken");
|
||||||
if (origTok != null) {
|
assert futureInputs[nextWrite].consumed;
|
||||||
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
// Not correct: a syn match whose output is longer
|
||||||
origPos += origPosInc.getPositionIncrement();
|
// than its input can set future inputs keepOrig
|
||||||
|
// to true:
|
||||||
|
//assert !futureInputs[nextWrite].keepOrig;
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
buffer = termAtt.buffer();
|
||||||
|
bufferLen = termAtt.length();
|
||||||
|
final PendingInput input = futureInputs[nextWrite];
|
||||||
|
input.startOffset = offsetAtt.startOffset();
|
||||||
|
input.endOffset = offsetAtt.endOffset();
|
||||||
|
//System.out.println(" new token=" + new String(buffer, 0, bufferLen));
|
||||||
|
if (nextRead != nextWrite) {
|
||||||
|
capture();
|
||||||
|
} else {
|
||||||
|
input.consumed = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// No more input tokens
|
||||||
|
//System.out.println(" set end");
|
||||||
|
finished = true;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
newPosIncAtt.setPositionIncrement(repPos - pos);
|
|
||||||
generated.add(newTok);
|
|
||||||
pos += newPosIncAtt.getPositionIncrement();
|
|
||||||
}
|
|
||||||
|
|
||||||
// finish up any leftover original tokens
|
|
||||||
while (origTok!=null) {
|
|
||||||
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
|
||||||
origPosInc.setPositionIncrement(origPos-pos);
|
|
||||||
generated.add(origTok);
|
|
||||||
pos += origPosInc.getPositionIncrement();
|
|
||||||
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
|
||||||
if (origTok != null) {
|
|
||||||
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
|
||||||
origPos += origPosInc.getPositionIncrement();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// what if we replaced a longer sequence with a shorter one?
|
|
||||||
// a/0 b/5 => foo/0
|
|
||||||
// should I re-create the gap on the next buffered token?
|
|
||||||
|
|
||||||
replacement = generated.iterator();
|
|
||||||
// Now return to the top of the loop to read and return the first
|
|
||||||
// generated token.. The reason this is done is that we may have generated
|
|
||||||
// nothing at all, and may need to continue with more matching logic.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
//
|
|
||||||
// Defer creation of the buffer until the first time it is used to
|
|
||||||
// optimize short fields with no matches.
|
|
||||||
//
|
|
||||||
private LinkedList<AttributeSource> buffer;
|
|
||||||
private LinkedList<AttributeSource> matched;
|
|
||||||
|
|
||||||
private boolean exhausted;
|
|
||||||
|
|
||||||
private AttributeSource nextTok() throws IOException {
|
|
||||||
if (buffer!=null && !buffer.isEmpty()) {
|
|
||||||
return buffer.removeFirst();
|
|
||||||
} else {
|
|
||||||
if (!exhausted && input.incrementToken()) {
|
|
||||||
return this;
|
|
||||||
} else {
|
} else {
|
||||||
exhausted = true;
|
// Still in our lookahead
|
||||||
return null;
|
buffer = futureInputs[curNextRead].term.chars;
|
||||||
|
bufferLen = futureInputs[curNextRead].term.length;
|
||||||
|
//System.out.println(" old token=" + new String(buffer, 0, bufferLen));
|
||||||
|
}
|
||||||
|
|
||||||
|
tokenCount++;
|
||||||
|
|
||||||
|
// Run each char in this token through the FST:
|
||||||
|
int bufUpto = 0;
|
||||||
|
while(bufUpto < bufferLen) {
|
||||||
|
final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
|
||||||
|
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
|
||||||
|
//System.out.println(" stop");
|
||||||
|
break byToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accum the output
|
||||||
|
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
|
||||||
|
//System.out.println(" char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
|
||||||
|
bufUpto += Character.charCount(codePoint);
|
||||||
|
}
|
||||||
|
|
||||||
|
// OK, entire token matched; now see if this is a final
|
||||||
|
// state:
|
||||||
|
if (scratchArc.isFinal()) {
|
||||||
|
matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
|
||||||
|
matchInputLength = tokenCount;
|
||||||
|
//System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput);
|
||||||
|
}
|
||||||
|
|
||||||
|
// See if the FST wants to continue matching (ie, needs to
|
||||||
|
// see the next input token):
|
||||||
|
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
|
||||||
|
// No further rules can match here; we're done
|
||||||
|
// searching for matching rules starting at the
|
||||||
|
// current input position.
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
// More matching is possible -- accum the output (if
|
||||||
|
// any) of the WORD_SEP arc:
|
||||||
|
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
|
||||||
|
if (nextRead == nextWrite) {
|
||||||
|
capture();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
curNextRead = rollIncr(curNextRead);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nextRead == nextWrite && !finished) {
|
||||||
|
//System.out.println(" skip write slot=" + nextWrite);
|
||||||
|
nextWrite = rollIncr(nextWrite);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (matchOutput != null) {
|
||||||
|
//System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
|
||||||
|
inputSkipCount = matchInputLength;
|
||||||
|
addOutput(matchOutput);
|
||||||
|
} else if (nextRead != nextWrite) {
|
||||||
|
// Even though we had no match here, we set to 1
|
||||||
|
// because we need to skip current input token before
|
||||||
|
// trying to match again:
|
||||||
|
inputSkipCount = 1;
|
||||||
|
} else {
|
||||||
|
assert finished;
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Interleaves all output tokens onto the futureOutputs:
|
||||||
|
private void addOutput(BytesRef bytes) {
|
||||||
|
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
|
||||||
|
|
||||||
|
final int code = bytesReader.readVInt();
|
||||||
|
final boolean keepOrig = (code & 0x1) == 0;
|
||||||
|
final int count = code >>> 1;
|
||||||
|
//System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig);
|
||||||
|
for(int outputIDX=0;outputIDX<count;outputIDX++) {
|
||||||
|
synonyms.words.get(bytesReader.readVInt(),
|
||||||
|
scratchBytes);
|
||||||
|
//System.out.println(" outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
|
||||||
|
UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
|
||||||
|
int lastStart = scratchChars.offset;
|
||||||
|
final int chEnd = lastStart + scratchChars.length;
|
||||||
|
int outputUpto = nextRead;
|
||||||
|
for(int chIDX=lastStart;chIDX<=chEnd;chIDX++) {
|
||||||
|
if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) {
|
||||||
|
final int outputLen = chIDX - lastStart;
|
||||||
|
// Caller is not allowed to have empty string in
|
||||||
|
// the output:
|
||||||
|
assert outputLen > 0: "output contains empty string: " + scratchChars;
|
||||||
|
futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
|
||||||
|
//System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
|
||||||
|
lastStart = 1+chIDX;
|
||||||
|
futureInputs[outputUpto].keepOrig |= keepOrig;
|
||||||
|
//System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
|
||||||
|
outputUpto = rollIncr(outputUpto);
|
||||||
|
assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void pushTok(AttributeSource t) {
|
// ++ mod rollBufferSize
|
||||||
if (buffer==null) buffer=new LinkedList<AttributeSource>();
|
private int rollIncr(int count) {
|
||||||
buffer.addFirst(t);
|
count++;
|
||||||
|
if (count == rollBufferSize) {
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
return count;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private SynonymMap match(SynonymMap map) throws IOException {
|
// for testing
|
||||||
SynonymMap result = null;
|
int getCaptureCount() {
|
||||||
|
return captureCount;
|
||||||
|
}
|
||||||
|
|
||||||
if (map.submap != null) {
|
@Override
|
||||||
AttributeSource tok = nextTok();
|
public boolean incrementToken() throws IOException {
|
||||||
if (tok != null) {
|
|
||||||
// clone ourselves.
|
|
||||||
if (tok == this)
|
|
||||||
tok = cloneAttributes();
|
|
||||||
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
|
|
||||||
CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
|
|
||||||
SynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
|
|
||||||
|
|
||||||
if (subMap != null) {
|
//System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
|
||||||
// recurse
|
|
||||||
result = match(subMap);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (result != null) {
|
while(true) {
|
||||||
matched.addFirst(tok);
|
|
||||||
|
// First play back any buffered future inputs/outputs
|
||||||
|
// w/o running parsing again:
|
||||||
|
while (inputSkipCount != 0) {
|
||||||
|
|
||||||
|
// At each position, we first output the original
|
||||||
|
// token
|
||||||
|
|
||||||
|
// TODO: maybe just a PendingState class, holding
|
||||||
|
// both input & outputs?
|
||||||
|
final PendingInput input = futureInputs[nextRead];
|
||||||
|
final PendingOutputs outputs = futureOutputs[nextRead];
|
||||||
|
|
||||||
|
//System.out.println(" cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
|
||||||
|
|
||||||
|
if (!input.consumed && (input.keepOrig || outputs.count == 0)) {
|
||||||
|
if (input.state != null) {
|
||||||
|
// Return a previously saved token (because we
|
||||||
|
// had to lookahead):
|
||||||
|
restoreState(input.state);
|
||||||
|
} else {
|
||||||
|
// Pass-through case: return token we just pulled
|
||||||
|
// but didn't capture:
|
||||||
|
assert inputSkipCount == 1: "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
|
||||||
|
}
|
||||||
|
input.reset();
|
||||||
|
if (outputs.count > 0) {
|
||||||
|
outputs.posIncr = 0;
|
||||||
|
} else {
|
||||||
|
nextRead = rollIncr(nextRead);
|
||||||
|
inputSkipCount--;
|
||||||
|
}
|
||||||
|
//System.out.println(" return token=" + termAtt.toString());
|
||||||
|
return true;
|
||||||
|
} else if (outputs.upto < outputs.count) {
|
||||||
|
// Still have pending outputs to replay at this
|
||||||
|
// position
|
||||||
|
input.reset();
|
||||||
|
final int posIncr = outputs.posIncr;
|
||||||
|
final CharsRef output = outputs.pullNext();
|
||||||
|
clearAttributes();
|
||||||
|
termAtt.copyBuffer(output.chars, output.offset, output.length);
|
||||||
|
typeAtt.setType(TYPE_SYNONYM);
|
||||||
|
offsetAtt.setOffset(input.startOffset, input.endOffset);
|
||||||
|
posIncrAtt.setPositionIncrement(posIncr);
|
||||||
|
if (outputs.count == 0) {
|
||||||
|
// Done with the buffered input and all outputs at
|
||||||
|
// this position
|
||||||
|
nextRead = rollIncr(nextRead);
|
||||||
|
inputSkipCount--;
|
||||||
|
}
|
||||||
|
//System.out.println(" return token=" + termAtt.toString());
|
||||||
|
return true;
|
||||||
} else {
|
} else {
|
||||||
// push back unmatched token
|
// Done with the buffered input and all outputs at
|
||||||
pushTok(tok);
|
// this position
|
||||||
|
input.reset();
|
||||||
|
nextRead = rollIncr(nextRead);
|
||||||
|
inputSkipCount--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (finished && nextRead == nextWrite) {
|
||||||
|
// End case: if any output syns went beyond end of
|
||||||
|
// input stream, enumerate them now:
|
||||||
|
final PendingOutputs outputs = futureOutputs[nextRead];
|
||||||
|
if (outputs.upto < outputs.count) {
|
||||||
|
final int posIncr = outputs.posIncr;
|
||||||
|
final CharsRef output = outputs.pullNext();
|
||||||
|
futureInputs[nextRead].reset();
|
||||||
|
if (outputs.count == 0) {
|
||||||
|
nextWrite = nextRead = rollIncr(nextRead);
|
||||||
|
}
|
||||||
|
clearAttributes();
|
||||||
|
termAtt.copyBuffer(output.chars, output.offset, output.length);
|
||||||
|
typeAtt.setType(TYPE_SYNONYM);
|
||||||
|
//System.out.println(" set posIncr=" + outputs.posIncr + " outputs=" + outputs);
|
||||||
|
posIncrAtt.setPositionIncrement(posIncr);
|
||||||
|
//System.out.println(" return token=" + termAtt.toString());
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find new synonym matches:
|
||||||
|
parse();
|
||||||
}
|
}
|
||||||
|
|
||||||
// if no longer sequence matched, so if this node has synonyms, it's the match.
|
|
||||||
if (result==null && map.synonyms!=null) {
|
|
||||||
result = map;
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void copy(AttributeSource target, AttributeSource source) {
|
|
||||||
if (target != source)
|
|
||||||
source.copyTo(target);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void reset() throws IOException {
|
public void reset() throws IOException {
|
||||||
input.reset();
|
super.reset();
|
||||||
replacement = null;
|
captureCount = 0;
|
||||||
exhausted = false;
|
finished = false;
|
||||||
|
|
||||||
|
// In normal usage these resets would not be needed,
|
||||||
|
// since they reset-as-they-are-consumed, but the app
|
||||||
|
// may not consume all input tokens in which case we
|
||||||
|
// have leftover state here:
|
||||||
|
for (PendingInput input : futureInputs) {
|
||||||
|
input.reset();
|
||||||
|
}
|
||||||
|
for (PendingOutputs output : futureOutputs) {
|
||||||
|
output.reset();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
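The reworked SynonymFilter above stacks matched synonym outputs on top of the input stream with type SYNONYM and position increment 0. A hedged sketch of observing that, not part of this patch, using the greedy-matching rules from the class javadoc (a => x, a b => y, b c d => z); the WhitespaceTokenizer import path and the Version constant are assumptions for illustration:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer; // assumption: any whitespace tokenizer
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

public class SynonymFilterSketch {
  public static void main(String[] args) throws Exception {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("a"), new CharsRef("x"), true);
    builder.add(SynonymMap.Builder.join(new String[] {"a", "b"}, new CharsRef()),
                new CharsRef("y"), true);
    builder.add(SynonymMap.Builder.join(new String[] {"b", "c", "d"}, new CharsRef()),
                new CharsRef("z"), true);
    SynonymMap map = builder.build();

    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("a b c d e"));
    ts = new SynonymFilter(ts, map, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      // Per the javadoc above, "a b" is consumed greedily by the second rule;
      // "y" comes out with type SYNONYM, stacked at the same position
      // (posIncr == 0) as the kept original token because includeOrig was true.
      System.out.println(term + " posIncr=" + posIncr.getPositionIncrement() + " type=" + type.type());
    }
    ts.end();
    ts.close();
  }
}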
@ -1,3 +1,5 @@
package org.apache.lucene.analysis.synonym;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
@ -15,146 +17,301 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.analysis.synonym;
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefHash;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
|
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||||
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
import java.util.*;
|
/**
|
||||||
|
* A map of synonyms, keys and values are phrases.
|
||||||
/** Mapping rules for use with {@link SynonymFilter}
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class SynonymMap {
|
public class SynonymMap {
|
||||||
/** @lucene.internal */
|
/** for multiword support, you must separate words with this separator */
|
||||||
public CharArrayMap<SynonymMap> submap; // recursive: Map<String, SynonymMap>
|
public static final char WORD_SEPARATOR = 0;
|
||||||
/** @lucene.internal */
|
/** map<input word, list<ord>> */
|
||||||
public Token[] synonyms;
|
public final FST<BytesRef> fst;
|
||||||
int flags;
|
/** map<ord, outputword> */
|
||||||
|
public final BytesRefHash words;
|
||||||
|
/** maxHorizontalContext: maximum context we need on the tokenstream */
|
||||||
|
public final int maxHorizontalContext;
|
||||||
|
|
||||||
static final int INCLUDE_ORIG=0x01;
|
public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext) {
|
||||||
static final int IGNORE_CASE=0x02;
|
this.fst = fst;
|
||||||
|
this.words = words;
|
||||||
public SynonymMap() {}
|
this.maxHorizontalContext = maxHorizontalContext;
|
||||||
public SynonymMap(boolean ignoreCase) {
|
|
||||||
if (ignoreCase) flags |= IGNORE_CASE;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; }
|
|
||||||
public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; }
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param singleMatch List<String>, the sequence of strings to match
|
* Builds an FSTSynonymMap.
|
||||||
* @param replacement List<Token> the list of tokens to use on a match
|
* <p>
|
||||||
* @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
|
* Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap
|
||||||
* @param mergeExisting merge the replacement tokens with any other mappings that exist
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
|
public static class Builder {
|
||||||
SynonymMap currMap = this;
|
private final HashMap<CharsRef,MapEntry> workingSet = new HashMap<CharsRef,MapEntry>();
|
||||||
for (String str : singleMatch) {
|
private final BytesRefHash words = new BytesRefHash();
|
||||||
if (currMap.submap==null) {
|
private final BytesRef utf8Scratch = new BytesRef(8);
|
||||||
// for now hardcode at 4.0, as its what the old code did.
|
private int maxHorizontalContext;
|
||||||
// would be nice to fix, but shouldn't store a version in each submap!!!
|
private final boolean dedup;
|
||||||
currMap.submap = new CharArrayMap<SynonymMap>(Version.LUCENE_40, 1, ignoreCase());
|
|
||||||
}
|
|
||||||
|
|
||||||
SynonymMap map = currMap.submap.get(str);
|
/** If dedup is true then identical rules (same input,
|
||||||
if (map==null) {
|
* same output) will be added only once. */
|
||||||
map = new SynonymMap();
|
public Builder(boolean dedup) {
|
||||||
map.flags |= flags & IGNORE_CASE;
|
this.dedup = dedup;
|
||||||
currMap.submap.put(str, map);
|
|
||||||
}
|
|
||||||
|
|
||||||
currMap = map;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (currMap.synonyms != null && !mergeExisting) {
|
private static class MapEntry {
|
||||||
throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
|
boolean includeOrig;
|
||||||
|
// we could sort for better sharing ultimately, but it could confuse people
|
||||||
|
ArrayList<Integer> ords = new ArrayList<Integer>();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Sugar: just joins the provided terms with {@link
|
||||||
|
* SynonymMap#WORD_SEPARATOR}. reuse and its chars
|
||||||
|
* must not be null. */
|
||||||
|
public static CharsRef join(String[] words, CharsRef reuse) {
|
||||||
|
int upto = 0;
|
||||||
|
char[] buffer = reuse.chars;
|
||||||
|
for(String word : words) {
|
||||||
|
if (upto > 0) {
|
||||||
|
if (upto >= buffer.length) {
|
||||||
|
reuse.grow(upto);
|
||||||
|
buffer = reuse.chars;
|
||||||
|
}
|
||||||
|
buffer[upto++] = SynonymMap.WORD_SEPARATOR;
|
||||||
|
}
|
||||||
|
|
||||||
|
final int wordLen = word.length();
|
||||||
|
final int needed = upto + wordLen;
|
||||||
|
if (needed > buffer.length) {
|
||||||
|
reuse.grow(needed);
|
||||||
|
buffer = reuse.chars;
|
||||||
|
}
|
||||||
|
|
||||||
|
word.getChars(0, wordLen, buffer, upto);
|
||||||
|
upto += wordLen;
|
||||||
|
}
|
||||||
|
|
||||||
|
return reuse;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Sugar: analyzes the text with the analyzer and
|
||||||
|
* separates by {@link SynonymMap#WORD_SEPARATOR}.
|
||||||
|
* reuse and its chars must not be null. */
|
||||||
|
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
|
||||||
|
TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text));
|
||||||
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
|
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
ts.reset();
|
||||||
|
reuse.length = 0;
|
||||||
|
while (ts.incrementToken()) {
|
||||||
|
int length = termAtt.length();
|
||||||
|
if (length == 0) {
|
||||||
|
throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
|
||||||
|
}
|
||||||
|
if (posIncAtt.getPositionIncrement() != 1) {
|
||||||
|
throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
|
||||||
|
}
|
||||||
|
reuse.grow(reuse.length + length + 1); /* current + word + separator */
|
||||||
|
int end = reuse.offset + reuse.length;
|
||||||
|
if (reuse.length > 0) {
|
||||||
|
reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
|
||||||
|
reuse.length++;
|
||||||
|
}
|
||||||
|
System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
|
||||||
|
reuse.length += length;
|
||||||
|
}
|
||||||
|
ts.end();
|
||||||
|
ts.close();
|
||||||
|
if (reuse.length == 0) {
|
||||||
|
throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
|
||||||
|
}
|
||||||
|
return reuse;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** only used for asserting! */
|
||||||
|
private boolean hasHoles(CharsRef chars) {
|
||||||
|
final int end = chars.offset + chars.length;
|
||||||
|
for(int idx=chars.offset+1;idx<end;idx++) {
|
||||||
|
if (chars.chars[idx] == SynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == SynonymMap.WORD_SEPARATOR) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (chars.chars[chars.offset] == '\u0000') {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (chars.chars[chars.offset + chars.length - 1] == '\u0000') {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: while it's tempting to make this public, since
|
||||||
|
// caller's parser likely knows the
|
||||||
|
// numInput/numOutputWords, sneaky exceptions, much later
|
||||||
|
// on, will result if these values are wrong; so we always
|
||||||
|
// recompute ourselves to be safe:
|
||||||
|
private void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) {
|
||||||
|
// first convert to UTF-8
|
||||||
|
if (numInputWords <= 0) {
|
||||||
|
throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
|
||||||
|
}
|
||||||
|
if (input.length <= 0) {
|
||||||
|
throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
|
||||||
|
}
|
||||||
|
if (numOutputWords <= 0) {
|
||||||
|
throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
|
||||||
|
}
|
||||||
|
if (output.length <= 0) {
|
||||||
|
throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
|
||||||
|
}
|
||||||
|
|
||||||
|
assert !hasHoles(input): "input has holes: " + input;
|
||||||
|
assert !hasHoles(output): "output has holes: " + output;
|
||||||
|
|
||||||
|
//System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
|
||||||
|
final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch);
|
||||||
|
// lookup in hash
|
||||||
|
int ord = words.add(utf8Scratch, hashCode);
|
||||||
|
if (ord < 0) {
|
||||||
|
// already exists in our hash
|
||||||
|
ord = (-ord)-1;
|
||||||
|
//System.out.println(" output=" + output + " old ord=" + ord);
|
||||||
|
} else {
|
||||||
|
//System.out.println(" output=" + output + " new ord=" + ord);
|
||||||
|
}
|
||||||
|
|
||||||
|
MapEntry e = workingSet.get(input);
|
||||||
|
if (e == null) {
|
||||||
|
e = new MapEntry();
|
||||||
|
workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map
|
||||||
|
}
|
||||||
|
|
||||||
|
e.ords.add(ord);
|
||||||
|
e.includeOrig |= includeOrig;
|
||||||
|
maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
|
||||||
|
maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int countWords(CharsRef chars) {
|
||||||
|
int wordCount = 1;
|
||||||
|
int upto = chars.offset;
|
||||||
|
final int limit = chars.offset + chars.length;
|
||||||
|
while(upto < limit) {
|
||||||
|
if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) {
|
||||||
|
wordCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return wordCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add a phrase->phrase synonym mapping.
|
||||||
|
* Phrases are character sequences where words are
|
||||||
|
* separated with character zero (\u0000). Empty words
|
||||||
|
* (two \u0000s in a row) are not allowed in the input nor
|
||||||
|
* the output!
|
||||||
|
*
|
||||||
|
* @param input input phrase
|
||||||
|
* @param output output phrase
|
||||||
|
* @param includeOrig true if the original should be included
|
||||||
|
*/
|
||||||
|
public void add(CharsRef input, CharsRef output, boolean includeOrig) {
|
||||||
|
add(input, countWords(input), output, countWords(output), includeOrig);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an {@link SynonymMap} and returns it.
|
||||||
|
*/
|
||||||
|
public SynonymMap build() throws IOException {
|
||||||
|
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
|
// TODO: are we using the best sharing options?
|
||||||
|
org.apache.lucene.util.fst.Builder<BytesRef> builder =
|
||||||
|
new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
|
||||||
|
|
||||||
|
BytesRef scratch = new BytesRef(64);
|
||||||
|
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
|
||||||
|
|
||||||
|
final Set<Integer> dedupSet;
|
||||||
|
|
||||||
|
if (dedup) {
|
||||||
|
dedupSet = new HashSet<Integer>();
|
||||||
|
} else {
|
||||||
|
dedupSet = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
final byte[] spare = new byte[5];
|
||||||
|
|
||||||
|
Set<CharsRef> keys = workingSet.keySet();
|
||||||
|
CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
|
||||||
|
Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
|
||||||
|
|
||||||
|
//System.out.println("fmap.build");
|
||||||
|
for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
|
||||||
|
CharsRef input = sortedKeys[keyIdx];
|
||||||
|
MapEntry output = workingSet.get(input);
|
||||||
|
|
||||||
|
int numEntries = output.ords.size();
|
||||||
|
// output size, assume the worst case
|
||||||
|
int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
|
||||||
|
|
||||||
|
scratch.grow(estimatedSize);
|
||||||
|
scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
|
||||||
|
assert scratch.offset == 0;
|
||||||
|
|
||||||
|
// now write our output data:
|
||||||
|
int count = 0;
|
||||||
|
for (int i = 0; i < numEntries; i++) {
|
||||||
|
if (dedupSet != null) {
|
||||||
|
// box once
|
||||||
|
final Integer ent = output.ords.get(i);
|
||||||
|
if (dedupSet.contains(ent)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
dedupSet.add(ent);
|
||||||
|
}
|
||||||
|
scratchOutput.writeVInt(output.ords.get(i));
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
final int pos = scratchOutput.getPosition();
|
||||||
|
scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
|
||||||
|
final int pos2 = scratchOutput.getPosition();
|
||||||
|
final int vIntLen = pos2-pos;
|
||||||
|
|
||||||
|
// Move the count + includeOrig to the front of the byte[]:
|
||||||
|
System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
|
||||||
|
System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
|
||||||
|
System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);
|
||||||
|
|
||||||
|
if (dedupSet != null) {
|
||||||
|
dedupSet.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
scratch.length = scratchOutput.getPosition() - scratch.offset;
|
||||||
|
//System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
|
||||||
|
builder.add(input, new BytesRef(scratch));
|
||||||
|
}
|
||||||
|
|
||||||
|
FST<BytesRef> fst = builder.finish();
|
||||||
|
return new SynonymMap(fst, words, maxHorizontalContext);
|
||||||
}
|
}
|
||||||
List<Token> superset = currMap.synonyms==null ? replacement :
|
|
||||||
mergeTokens(Arrays.asList(currMap.synonyms), replacement);
|
|
||||||
currMap.synonyms = superset.toArray(new Token[superset.size()]);
|
|
||||||
if (includeOrig) currMap.flags |= INCLUDE_ORIG;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
StringBuilder sb = new StringBuilder("<");
|
|
||||||
if (synonyms!=null) {
|
|
||||||
sb.append("[");
|
|
||||||
for (int i=0; i<synonyms.length; i++) {
|
|
||||||
if (i!=0) sb.append(',');
|
|
||||||
sb.append(synonyms[i]);
|
|
||||||
}
|
|
||||||
if ((flags & INCLUDE_ORIG)!=0) {
|
|
||||||
sb.append(",ORIG");
|
|
||||||
}
|
|
||||||
sb.append("],");
|
|
||||||
}
|
|
||||||
sb.append(submap);
|
|
||||||
sb.append(">");
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/** Produces a List<Token> from a List<String> */
|
|
||||||
public static List<Token> makeTokens(List<String> strings) {
|
|
||||||
List<Token> ret = new ArrayList<Token>(strings.size());
|
|
||||||
for (String str : strings) {
|
|
||||||
//Token newTok = new Token(str,0,0,"SYNONYM");
|
|
||||||
Token newTok = new Token(str, 0,0,"SYNONYM");
|
|
||||||
ret.add(newTok);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
|
|
||||||
* the tokens end up at the same position.
|
|
||||||
*
|
|
||||||
* Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
|
|
||||||
* Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
|
|
||||||
ArrayList<Token> result = new ArrayList<Token>();
|
|
||||||
if (lst1 ==null || lst2 ==null) {
|
|
||||||
if (lst2 != null) result.addAll(lst2);
|
|
||||||
if (lst1 != null) result.addAll(lst1);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
int pos=0;
|
|
||||||
Iterator<Token> iter1=lst1.iterator();
|
|
||||||
Iterator<Token> iter2=lst2.iterator();
|
|
||||||
Token tok1 = iter1.hasNext() ? iter1.next() : null;
|
|
||||||
Token tok2 = iter2.hasNext() ? iter2.next() : null;
|
|
||||||
int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0;
|
|
||||||
int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0;
|
|
||||||
while(tok1!=null || tok2!=null) {
|
|
||||||
while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
|
|
||||||
Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
|
|
||||||
tok.copyBuffer(tok1.buffer(), 0, tok1.length());
|
|
||||||
tok.setPositionIncrement(pos1-pos);
|
|
||||||
result.add(tok);
|
|
||||||
pos=pos1;
|
|
||||||
tok1 = iter1.hasNext() ? iter1.next() : null;
|
|
||||||
pos1 += tok1!=null ? tok1.getPositionIncrement() : 0;
|
|
||||||
}
|
|
||||||
while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
|
|
||||||
Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
|
|
||||||
tok.copyBuffer(tok2.buffer(), 0, tok2.length());
|
|
||||||
tok.setPositionIncrement(pos2-pos);
|
|
||||||
result.add(tok);
|
|
||||||
pos=pos2;
|
|
||||||
tok2 = iter2.hasNext() ? iter2.next() : null;
|
|
||||||
pos2 += tok2!=null ? tok2.getPositionIncrement() : 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
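The SynonymMap.Builder documented above can also be used directly, without a file-format parser: call add() for each rule (multi-word phrases are words joined with WORD_SEPARATOR, most easily via join()), then build(). A minimal sketch, not part of this patch, with illustrative class and method names:

import java.io.IOException;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;

public class SynonymMapBuilderSketch {
  public static SynonymMap buildMap() throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true); // dedup identical rules

    // Multi-word input phrase "wi fi" -> single-word output "wifi";
    // includeOrig=false means the original tokens are not kept on a match.
    CharsRef input = SynonymMap.Builder.join(new String[] {"wi", "fi"}, new CharsRef());
    builder.add(input, new CharsRef("wifi"), false);

    // A plain single-word rule needs no separator handling at all:
    builder.add(new CharsRef("colour"), new CharsRef("color"), true);

    return builder.build(); // compiles the rules into the FST-backed SynonymMap
  }
}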
@ -0,0 +1,112 @@
package org.apache.lucene.analysis.synonym;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;

/**
 * Parser for wordnet prolog format
 * <p>
 * See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
 * @lucene.experimental
 */
// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
public class WordnetSynonymParser extends SynonymMap.Builder {
  private final boolean expand;
  private final Analyzer analyzer;

  public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup);
    this.expand = expand;
    this.analyzer = analyzer;
  }

  public void add(Reader in) throws IOException, ParseException {
    LineNumberReader br = new LineNumberReader(in);
    try {
      String line = null;
      String lastSynSetID = "";
      CharsRef synset[] = new CharsRef[8];
      int synsetSize = 0;

      while ((line = br.readLine()) != null) {
        String synSetID = line.substring(2, 11);

        if (!synSetID.equals(lastSynSetID)) {
          addInternal(synset, synsetSize);
          synsetSize = 0;
        }

        if (synset.length <= synsetSize+1) {
          CharsRef larger[] = new CharsRef[synset.length * 2];
          System.arraycopy(synset, 0, larger, 0, synsetSize);
          synset = larger;
        }

        synset[synsetSize] = parseSynonym(line, synset[synsetSize]);
        synsetSize++;
        lastSynSetID = synSetID;
      }

      // final synset in the file
      addInternal(synset, synsetSize);
    } catch (IllegalArgumentException e) {
      ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
      ex.initCause(e);
      throw ex;
    } finally {
      br.close();
    }
  }

  private CharsRef parseSynonym(String line, CharsRef reuse) throws IOException {
    if (reuse == null) {
      reuse = new CharsRef(8);
    }

    int start = line.indexOf('\'')+1;
    int end = line.lastIndexOf('\'');

    String text = line.substring(start, end).replace("''", "'");
    return analyze(analyzer, text, reuse);
  }

  private void addInternal(CharsRef synset[], int size) throws IOException {
    if (size <= 1) {
      return; // nothing to do
    }

    if (expand) {
      for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
          add(synset[i], synset[j], false);
        }
      }
    } else {
      for (int i = 0; i < size; i++) {
        add(synset[i], synset[0], false);
      }
    }
  }
}
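For orientation, the prolog s() facts this parser reads look like the lines below; a minimal usage sketch follows (the data is illustrative, and `analyzer` is assumed to be whatever Analyzer should normalize each synonym, e.g. a whitespace/lowercasing one):

    // wn_s.pl lines have the shape s(synset_id, w_num, 'word', ss_type, sense_number, tag_count).
    // add() groups consecutive lines by the synset id found at line.substring(2, 11), and
    // parseSynonym() takes the text between the first and last single quote ('' unescapes to ').
    String prolog =
        "s(100000001,1,'woods',n,1,0).\n" +
        "s(100000001,2,'wood',n,1,0).\n" +
        "s(100000001,3,'forest',n,1,0).\n";

    // dedup=true drops duplicate rules; expand=true maps every member of a synset to every
    // other member, while expand=false maps all members onto the first one only.
    WordnetSynonymParser parser = new WordnetSynonymParser(true, true, analyzer);
    parser.add(new StringReader(prolog));
    SynonymMap map = parser.build();   // hand this to a SynonymFilter at index or query time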
@ -1,3 +1,4 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
 <!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
@ -14,13 +15,8 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
-<html>
-<head>
-<title>
-wordnet
-</title>
-</head>
-<body>
-wordnet
-</body>
+<html><head></head>
+<body>
+Analysis components for Synonyms.
+</body>
 </html>
@ -0,0 +1,144 @@
package org.apache.lucene.analysis.synonym;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;
import java.text.ParseException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.junit.Test;

/**
 * Tests parser for the Solr synonyms format
 * @lucene.experimental
 */
public class TestSolrSynonymParser extends BaseTokenStreamTestCase {

  /** Tests some simple examples from the solr wiki */
  public void testSimple() throws Exception {
    String testFile =
      "i-pod, ipod, ipoooood\n" +
      "foo => foo bar\n" +
      "foo => baz\n" +
      "this test, that testing";

    SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
    parser.add(new StringReader(testFile));
    final SynonymMap map = parser.build();

    Analyzer analyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    assertAnalyzesTo(analyzer, "ball",
        new String[] { "ball" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "i-pod",
        new String[] { "i-pod", "ipod", "ipoooood" },
        new int[] { 1, 0, 0 });

    assertAnalyzesTo(analyzer, "foo",
        new String[] { "foo", "baz", "bar" },
        new int[] { 1, 0, 1 });

    assertAnalyzesTo(analyzer, "this test",
        new String[] { "this", "that", "test", "testing" },
        new int[] { 1, 0, 1, 0 });
  }

  /** parse a syn file with bad syntax */
  @Test(expected=ParseException.class)
  public void testInvalidDoubleMap() throws Exception {
    String testFile = "a => b => c";
    SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
    parser.add(new StringReader(testFile));
  }

  /** parse a syn file with bad syntax */
  @Test(expected=ParseException.class)
  public void testInvalidAnalyzesToNothingOutput() throws Exception {
    String testFile = "a => 1";
    SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
    parser.add(new StringReader(testFile));
  }

  /** parse a syn file with bad syntax */
  @Test(expected=ParseException.class)
  public void testInvalidAnalyzesToNothingInput() throws Exception {
    String testFile = "1 => a";
    SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.SIMPLE, false));
    parser.add(new StringReader(testFile));
  }

  /** parse a syn file with bad syntax */
  @Test(expected=ParseException.class)
  public void testInvalidPositionsInput() throws Exception {
    String testFile = "testola => the test";
    SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
    parser.add(new StringReader(testFile));
  }

  /** parse a syn file with bad syntax */
  @Test(expected=ParseException.class)
  public void testInvalidPositionsOutput() throws Exception {
    String testFile = "the test => testola";
    SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
    parser.add(new StringReader(testFile));
  }

  /** parse a syn file with some escaped syntax chars */
  public void testEscapedStuff() throws Exception {
    String testFile =
      "a\\=>a => b\\=>b\n" +
      "a\\,a => b\\,b";
    SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
    parser.add(new StringReader(testFile));
    final SynonymMap map = parser.build();
    Analyzer analyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
      }
    };

    assertAnalyzesTo(analyzer, "ball",
        new String[] { "ball" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "a=>a",
        new String[] { "b=>b" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "a,a",
        new String[] { "b,b" },
        new int[] { 1 });
  }
}
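The two testInvalidPositions cases above are the subtle ones: with EnglishAnalyzer the stopword "the" is removed, so one side of the rule analyzes to a token whose position increment is greater than one, and the parser rejects the rule because the map cannot represent such a gap. A minimal sketch of that failure mode, mirroring the test setup but handling the expected exception inline instead of via @Test:

    SolrSynonymParser parser = new SolrSynonymParser(true, true, new EnglishAnalyzer(TEST_VERSION_CURRENT));
    try {
      parser.add(new StringReader("testola => the test"));   // "the" is dropped, leaving a position hole
    } catch (ParseException expected) {
      // rule rejected: each side of a rule must analyze to one token per consecutive position
    }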
@ -0,0 +1,393 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.synonym;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.*;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
private SynonymMap.Builder b;
|
||||||
|
private Tokenizer tokensIn;
|
||||||
|
private SynonymFilter tokensOut;
|
||||||
|
private CharTermAttribute termAtt;
|
||||||
|
private PositionIncrementAttribute posIncrAtt;
|
||||||
|
private OffsetAttribute offsetAtt;
|
||||||
|
|
||||||
|
private void add(String input, String output, boolean keepOrig) {
|
||||||
|
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
|
||||||
|
new CharsRef(output.replaceAll(" +", "\u0000")),
|
||||||
|
keepOrig);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertEquals(CharTermAttribute term, String expected) {
|
||||||
|
assertEquals(expected.length(), term.length());
|
||||||
|
final char[] buffer = term.buffer();
|
||||||
|
for(int chIDX=0;chIDX<expected.length();chIDX++) {
|
||||||
|
assertEquals(expected.charAt(chIDX), buffer[chIDX]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verify(String input, String output) throws Exception {
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
|
||||||
|
}
|
||||||
|
|
||||||
|
tokensIn.reset(new StringReader(input));
|
||||||
|
tokensOut.reset();
|
||||||
|
final String[] expected = output.split(" ");
|
||||||
|
int expectedUpto = 0;
|
||||||
|
while(tokensOut.incrementToken()) {
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
|
||||||
|
}
|
||||||
|
|
||||||
|
assertTrue(expectedUpto < expected.length);
|
||||||
|
final int startOffset = offsetAtt.startOffset();
|
||||||
|
final int endOffset = offsetAtt.endOffset();
|
||||||
|
|
||||||
|
final String[] expectedAtPos = expected[expectedUpto++].split("/");
|
||||||
|
for(int atPos=0;atPos<expectedAtPos.length;atPos++) {
|
||||||
|
if (atPos > 0) {
|
||||||
|
assertTrue(tokensOut.incrementToken());
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertEquals(termAtt, expectedAtPos[atPos]);
|
||||||
|
assertEquals(atPos == 0 ? 1 : 0,
|
||||||
|
posIncrAtt.getPositionIncrement());
|
||||||
|
// start/end offset of all tokens at same pos should
|
||||||
|
// be the same:
|
||||||
|
assertEquals(startOffset, offsetAtt.startOffset());
|
||||||
|
assertEquals(endOffset, offsetAtt.endOffset());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tokensOut.end();
|
||||||
|
tokensOut.close();
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" incr: END");
|
||||||
|
}
|
||||||
|
assertEquals(expectedUpto, expected.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBasic() throws Exception {
|
||||||
|
b = new SynonymMap.Builder(true);
|
||||||
|
add("a", "foo", true);
|
||||||
|
add("a b", "bar fee", true);
|
||||||
|
add("b c", "dog collar", true);
|
||||||
|
add("c d", "dog harness holder extras", true);
|
||||||
|
add("m c e", "dog barks loudly", false);
|
||||||
|
|
||||||
|
add("e f", "foo bar", false);
|
||||||
|
add("e f", "baz bee", false);
|
||||||
|
|
||||||
|
add("z", "boo", false);
|
||||||
|
add("y", "bee", true);
|
||||||
|
|
||||||
|
tokensIn = new MockTokenizer(new StringReader("a"),
|
||||||
|
MockTokenizer.WHITESPACE,
|
||||||
|
true);
|
||||||
|
tokensIn.reset();
|
||||||
|
assertTrue(tokensIn.incrementToken());
|
||||||
|
assertFalse(tokensIn.incrementToken());
|
||||||
|
tokensIn.end();
|
||||||
|
tokensIn.close();
|
||||||
|
|
||||||
|
tokensOut = new SynonymFilter(tokensIn,
|
||||||
|
b.build(),
|
||||||
|
true);
|
||||||
|
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
|
||||||
|
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
|
verify("a b c", "a/bar b/fee c");
|
||||||
|
|
||||||
|
// syn output extends beyond input tokens
|
||||||
|
verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");
|
||||||
|
|
||||||
|
verify("a b a", "a/bar b/fee a/foo");
|
||||||
|
|
||||||
|
// outputs that add to one another:
|
||||||
|
verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");
|
||||||
|
|
||||||
|
// two outputs for same input
|
||||||
|
verify("e f", "foo/baz bar/bee");
|
||||||
|
|
||||||
|
// mixed keepOrig true/false:
|
||||||
|
verify("a m c e x", "a/foo dog barks loudly x");
|
||||||
|
verify("c d m c e x", "c/dog d/harness m/holder/dog c/extras/barks loudly x");
|
||||||
|
assertTrue(tokensOut.getCaptureCount() > 0);
|
||||||
|
|
||||||
|
// no captureStates when no syns matched
|
||||||
|
verify("p q r s t", "p q r s t");
|
||||||
|
assertEquals(0, tokensOut.getCaptureCount());
|
||||||
|
|
||||||
|
// no captureStates when only single-input syns, w/ no
|
||||||
|
// lookahead needed, matched
|
||||||
|
verify("p q z y t", "p q boo y/bee t");
|
||||||
|
assertEquals(0, tokensOut.getCaptureCount());
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getRandomString(char start, int alphabetSize, int length) {
|
||||||
|
assert alphabetSize <= 26;
|
||||||
|
char[] s = new char[2*length];
|
||||||
|
for(int charIDX=0;charIDX<length;charIDX++) {
|
||||||
|
s[2*charIDX] = (char) (start + random.nextInt(alphabetSize));
|
||||||
|
s[2*charIDX+1] = ' ';
|
||||||
|
}
|
||||||
|
return new String(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class OneSyn {
|
||||||
|
String in;
|
||||||
|
List<String> out;
|
||||||
|
boolean keepOrig;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String slowSynMatcher(String doc, List<OneSyn> syns, int maxOutputLength) {
|
||||||
|
assertTrue(doc.length() % 2 == 0);
|
||||||
|
final int numInputs = doc.length()/2;
|
||||||
|
boolean[] keepOrigs = new boolean[numInputs];
|
||||||
|
Arrays.fill(keepOrigs, false);
|
||||||
|
String[] outputs = new String[numInputs + maxOutputLength];
|
||||||
|
OneSyn[] matches = new OneSyn[numInputs];
|
||||||
|
for(OneSyn syn : syns) {
|
||||||
|
int idx = -1;
|
||||||
|
while(true) {
|
||||||
|
idx = doc.indexOf(syn.in, 1+idx);
|
||||||
|
if (idx == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
assertTrue(idx % 2 == 0);
|
||||||
|
final int matchIDX = idx/2;
|
||||||
|
assertTrue(syn.in.length() % 2 == 1);
|
||||||
|
if (matches[matchIDX] == null) {
|
||||||
|
matches[matchIDX] = syn;
|
||||||
|
} else if (syn.in.length() > matches[matchIDX].in.length()) {
|
||||||
|
// Greedy conflict resolution: longer match wins:
|
||||||
|
matches[matchIDX] = syn;
|
||||||
|
} else {
|
||||||
|
assertTrue(syn.in.length() < matches[matchIDX].in.length());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Greedy conflict resolution: if syn matches a range of inputs,
|
||||||
|
// it prevents other syns from matching that range
|
||||||
|
for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
|
||||||
|
final OneSyn match = matches[inputIDX];
|
||||||
|
if (match != null) {
|
||||||
|
final int synInLength = (1+match.in.length())/2;
|
||||||
|
for(int nextInputIDX=inputIDX+1;nextInputIDX<numInputs && nextInputIDX<(inputIDX+synInLength);nextInputIDX++) {
|
||||||
|
matches[nextInputIDX] = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fill overlapping outputs:
|
||||||
|
for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
|
||||||
|
final OneSyn syn = matches[inputIDX];
|
||||||
|
if (syn == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for(String synOut : syn.out) {
|
||||||
|
final String[] synOutputs = synOut.split(" ");
|
||||||
|
assertEquals(synOutputs.length, (1+synOut.length())/2);
|
||||||
|
final int matchEnd = inputIDX + synOutputs.length;
|
||||||
|
int synUpto = 0;
|
||||||
|
for(int matchIDX=inputIDX;matchIDX<matchEnd;matchIDX++) {
|
||||||
|
if (outputs[matchIDX] == null) {
|
||||||
|
outputs[matchIDX] = synOutputs[synUpto++];
|
||||||
|
} else {
|
||||||
|
outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
|
||||||
|
}
|
||||||
|
if (matchIDX < numInputs) {
|
||||||
|
keepOrigs[matchIDX] |= syn.keepOrig;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
String[] inputTokens = doc.split(" ");
|
||||||
|
final int limit = inputTokens.length + maxOutputLength;
|
||||||
|
for(int inputIDX=0;inputIDX<limit;inputIDX++) {
|
||||||
|
boolean posHasOutput = false;
|
||||||
|
if (inputIDX >= numInputs && outputs[inputIDX] == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (inputIDX < numInputs && (outputs[inputIDX] == null || keepOrigs[inputIDX])) {
|
||||||
|
sb.append(inputTokens[inputIDX]);
|
||||||
|
posHasOutput = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outputs[inputIDX] != null) {
|
||||||
|
if (posHasOutput) {
|
||||||
|
sb.append('/');
|
||||||
|
}
|
||||||
|
sb.append(outputs[inputIDX]);
|
||||||
|
}
|
||||||
|
if (inputIDX < limit-1) {
|
||||||
|
sb.append(' ');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandom() throws Exception {
|
||||||
|
|
||||||
|
final int alphabetSize = _TestUtil.nextInt(random, 2, 7);
|
||||||
|
|
||||||
|
final int docLen = atLeast(3000);
|
||||||
|
//final int docLen = 50;
|
||||||
|
|
||||||
|
final String document = getRandomString('a', alphabetSize, docLen);
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: doc=" + document);
|
||||||
|
}
|
||||||
|
|
||||||
|
final int numSyn = atLeast(5);
|
||||||
|
//final int numSyn = 2;
|
||||||
|
|
||||||
|
final Map<String,OneSyn> synMap = new HashMap<String,OneSyn>();
|
||||||
|
final List<OneSyn> syns = new ArrayList<OneSyn>();
|
||||||
|
final boolean dedup = random.nextBoolean();
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" dedup=" + dedup);
|
||||||
|
}
|
||||||
|
b = new SynonymMap.Builder(dedup);
|
||||||
|
for(int synIDX=0;synIDX<numSyn;synIDX++) {
|
||||||
|
final String synIn = getRandomString('a', alphabetSize, _TestUtil.nextInt(random, 1, 5)).trim();
|
||||||
|
OneSyn s = synMap.get(synIn);
|
||||||
|
if (s == null) {
|
||||||
|
s = new OneSyn();
|
||||||
|
s.in = synIn;
|
||||||
|
syns.add(s);
|
||||||
|
s.out = new ArrayList<String>();
|
||||||
|
synMap.put(synIn, s);
|
||||||
|
s.keepOrig = random.nextBoolean();
|
||||||
|
}
|
||||||
|
final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim();
|
||||||
|
s.out.add(synOut);
|
||||||
|
add(synIn, synOut, s.keepOrig);
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tokensIn = new MockTokenizer(new StringReader("a"),
|
||||||
|
MockTokenizer.WHITESPACE,
|
||||||
|
true);
|
||||||
|
tokensIn.reset();
|
||||||
|
assertTrue(tokensIn.incrementToken());
|
||||||
|
assertFalse(tokensIn.incrementToken());
|
||||||
|
tokensIn.end();
|
||||||
|
tokensIn.close();
|
||||||
|
|
||||||
|
tokensOut = new SynonymFilter(tokensIn,
|
||||||
|
b.build(),
|
||||||
|
true);
|
||||||
|
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
|
||||||
|
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
|
if (dedup) {
|
||||||
|
pruneDups(syns);
|
||||||
|
}
|
||||||
|
|
||||||
|
final String expected = slowSynMatcher(document, syns, 5);
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: expected=" + expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
verify(document, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void pruneDups(List<OneSyn> syns) {
|
||||||
|
Set<String> seen = new HashSet<String>();
|
||||||
|
for(OneSyn syn : syns) {
|
||||||
|
int idx = 0;
|
||||||
|
while(idx < syn.out.size()) {
|
||||||
|
String out = syn.out.get(idx);
|
||||||
|
if (!seen.contains(out)) {
|
||||||
|
seen.add(out);
|
||||||
|
idx++;
|
||||||
|
} else {
|
||||||
|
syn.out.remove(idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
seen.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String randomNonEmptyString() {
|
||||||
|
while(true) {
|
||||||
|
final String s = _TestUtil.randomUnicodeString(random).trim();
|
||||||
|
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** simple random test, doesn't verify correctness.
|
||||||
|
* does verify it doesnt throw exceptions, or that the stream doesn't misbehave
|
||||||
|
*/
|
||||||
|
public void testRandom2() throws Exception {
|
||||||
|
final int numIters = atLeast(10);
|
||||||
|
for (int i = 0; i < numIters; i++) {
|
||||||
|
b = new SynonymMap.Builder(random.nextBoolean());
|
||||||
|
final int numEntries = atLeast(10);
|
||||||
|
for (int j = 0; j < numEntries; j++) {
|
||||||
|
add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
|
||||||
|
}
|
||||||
|
final SynonymMap map = b.build();
|
||||||
|
final boolean ignoreCase = random.nextBoolean();
|
||||||
|
|
||||||
|
final Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
|
||||||
|
return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,72 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.synonym;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;

public class TestWordnetSynonymParser extends BaseTokenStreamTestCase {
  Analyzer analyzer;

  String synonymsFile =
    "s(100000001,1,'woods',n,1,0).\n" +
    "s(100000001,2,'wood',n,1,0).\n" +
    "s(100000001,3,'forest',n,1,0).\n" +
    "s(100000002,1,'wolfish',n,1,0).\n" +
    "s(100000002,2,'ravenous',n,1,0).\n" +
    "s(100000003,1,'king',n,1,1).\n" +
    "s(100000003,2,'baron',n,1,1).\n" +
    "s(100000004,1,'king''s evil',n,1,1).\n" +
    "s(100000004,2,'king''s meany',n,1,1).\n";

  public void testSynonyms() throws Exception {
    WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random));
    parser.add(new StringReader(synonymsFile));
    final SynonymMap map = parser.build();

    Analyzer analyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
      }
    };

    /* all expansions */
    assertAnalyzesTo(analyzer, "Lost in the woods",
        new String[] { "Lost", "in", "the", "woods", "wood", "forest" },
        new int[] { 0, 5, 8, 12, 12, 12 },
        new int[] { 4, 7, 11, 17, 17, 17 },
        new int[] { 1, 1, 1, 1, 0, 0 });

    /* single quote */
    assertAnalyzesTo(analyzer, "king",
        new String[] { "king", "baron" });

    /* multi words */
    assertAnalyzesTo(analyzer, "king's evil",
        new String[] { "king's", "king's", "evil", "meany" });
  }
}
@ -90,6 +90,10 @@ import org.apache.lucene.store.OutputStreamDataOutput;
  *
  * <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
  * nothing else.
+ *
+ * <b>NOTE</b>: the FST file format is experimental and
+ * subject to suddenly change, requiring you to rebuild the
+ * FST suggest index.
  */
 public class FSTLookup extends Lookup {
@ -320,6 +320,9 @@ New Features
 Optimizations
 ----------------------
+
+* LUCENE-3233: Improved memory usage, build time, and performance of
+  SynonymFilterFactory.  (Mike McCandless, Robert Muir)

 Bug Fixes
 ----------------------
@ -0,0 +1,157 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.nio.charset.CharsetDecoder;
|
||||||
|
import java.nio.charset.CodingErrorAction;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||||
|
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||||
|
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
|
||||||
|
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.apache.solr.common.ResourceLoader;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
import org.apache.solr.common.util.StrUtils;
|
||||||
|
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. this is only a backwards compatibility
|
||||||
|
* mechanism that will be removed in Lucene 5.0
|
||||||
|
*/
|
||||||
|
// NOTE: rename this to "SynonymFilterFactory" and nuke that delegator in Lucene 5.0!
|
||||||
|
@Deprecated
|
||||||
|
final class FSTSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||||
|
private SynonymMap map;
|
||||||
|
private boolean ignoreCase;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new SynonymFilter(input, map, ignoreCase);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void inform(ResourceLoader loader) {
|
||||||
|
final boolean ignoreCase = getBoolean("ignoreCase", false);
|
||||||
|
this.ignoreCase = ignoreCase;
|
||||||
|
|
||||||
|
String tf = args.get("tokenizerFactory");
|
||||||
|
|
||||||
|
final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args);
|
||||||
|
|
||||||
|
Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
|
||||||
|
TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
|
||||||
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
String format = args.get("format");
|
||||||
|
try {
|
||||||
|
if (format == null || format.equals("solr")) {
|
||||||
|
// TODO: expose dedup as a parameter?
|
||||||
|
map = loadSolrSynonyms(loader, true, analyzer);
|
||||||
|
} else if (format.equals("wordnet")) {
|
||||||
|
map = loadWordnetSynonyms(loader, true, analyzer);
|
||||||
|
} else {
|
||||||
|
// TODO: somehow make this more pluggable
|
||||||
|
throw new RuntimeException("Unrecognized synonyms format: " + format);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load synonyms from the solr format, "format=solr".
|
||||||
|
*/
|
||||||
|
private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
|
||||||
|
final boolean expand = getBoolean("expand", true);
|
||||||
|
String synonyms = args.get("synonyms");
|
||||||
|
if (synonyms == null)
|
||||||
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
|
||||||
|
|
||||||
|
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
|
||||||
|
.onMalformedInput(CodingErrorAction.REPORT)
|
||||||
|
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||||
|
|
||||||
|
SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
|
||||||
|
File synonymFile = new File(synonyms);
|
||||||
|
if (synonymFile.exists()) {
|
||||||
|
decoder.reset();
|
||||||
|
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
|
||||||
|
} else {
|
||||||
|
List<String> files = StrUtils.splitFileNames(synonyms);
|
||||||
|
for (String file : files) {
|
||||||
|
decoder.reset();
|
||||||
|
parser.add(new InputStreamReader(loader.openResource(file), decoder));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return parser.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load synonyms from the wordnet format, "format=wordnet".
|
||||||
|
*/
|
||||||
|
private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
|
||||||
|
final boolean expand = getBoolean("expand", true);
|
||||||
|
String synonyms = args.get("synonyms");
|
||||||
|
if (synonyms == null)
|
||||||
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
|
||||||
|
|
||||||
|
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
|
||||||
|
.onMalformedInput(CodingErrorAction.REPORT)
|
||||||
|
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||||
|
|
||||||
|
WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer);
|
||||||
|
File synonymFile = new File(synonyms);
|
||||||
|
if (synonymFile.exists()) {
|
||||||
|
decoder.reset();
|
||||||
|
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
|
||||||
|
} else {
|
||||||
|
List<String> files = StrUtils.splitFileNames(synonyms);
|
||||||
|
for (String file : files) {
|
||||||
|
decoder.reset();
|
||||||
|
parser.add(new InputStreamReader(loader.openResource(file), decoder));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return parser.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
|
||||||
|
TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname);
|
||||||
|
tokFactory.init(args);
|
||||||
|
return tokFactory;
|
||||||
|
}
|
||||||
|
}
|
|
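The format dispatch in FSTSynonymFilterFactory.inform() above reduces to choosing a parser per synonyms format. A standalone sketch of that choice, with the resource loader and factory args replaced by plain parameters (the method name and parameters here are illustrative, not part of the factory's API):

  // format == null or "solr" selects SolrSynonymParser; "wordnet" selects WordnetSynonymParser.
  // The factory hard-wires dedup=true; expand defaults to true unless configured otherwise.
  SynonymMap loadMap(String format, boolean expand, Analyzer analyzer, Reader rules)
      throws IOException, ParseException {
    if (format == null || format.equals("solr")) {
      SolrSynonymParser parser = new SolrSynonymParser(true, expand, analyzer);
      parser.add(rules);
      return parser.build();
    } else if (format.equals("wordnet")) {
      WordnetSynonymParser parser = new WordnetSynonymParser(true, expand, analyzer);
      parser.add(rules);
      return parser.build();
    }
    throw new IllegalArgumentException("Unrecognized synonyms format: " + format);
  }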
@ -0,0 +1,261 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
|
||||||
|
/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
|
||||||
|
* <p>
|
||||||
|
* The matched tokens from the input stream may be optionally passed through (includeOrig=true)
|
||||||
|
* or discarded. If the original tokens are included, the position increments may be modified
|
||||||
|
* to retain absolute positions after merging with the synonym tokenstream.
|
||||||
|
* <p>
|
||||||
|
* Generated synonyms will start at the same position as the first matched source token.
|
||||||
|
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
final class SlowSynonymFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private final SlowSynonymMap map; // Map<String, SynonymMap>
|
||||||
|
private Iterator<AttributeSource> replacement; // iterator over generated tokens
|
||||||
|
|
||||||
|
public SlowSynonymFilter(TokenStream in, SlowSynonymMap map) {
|
||||||
|
super(in);
|
||||||
|
if (map == null)
|
||||||
|
throw new IllegalArgumentException("map is required");
|
||||||
|
|
||||||
|
this.map = map;
|
||||||
|
// just ensuring these attributes exist...
|
||||||
|
addAttribute(CharTermAttribute.class);
|
||||||
|
addAttribute(PositionIncrementAttribute.class);
|
||||||
|
addAttribute(OffsetAttribute.class);
|
||||||
|
addAttribute(TypeAttribute.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Need to worry about multiple scenarios:
|
||||||
|
* - need to go for the longest match
|
||||||
|
* a b => foo #shouldn't match if "a b" is followed by "c d"
|
||||||
|
* a b c d => bar
|
||||||
|
* - need to backtrack - retry matches for tokens already read
|
||||||
|
* a b c d => foo
|
||||||
|
* b c => bar
|
||||||
|
* If the input stream is "a b c x", one will consume "a b c d"
|
||||||
|
* trying to match the first rule... all but "a" should be
|
||||||
|
* pushed back so a match may be made on "b c".
|
||||||
|
* - don't try and match generated tokens (thus need separate queue)
|
||||||
|
* matching is not recursive.
|
||||||
|
* - handle optional generation of original tokens in all these cases,
|
||||||
|
* merging token streams to preserve token positions.
|
||||||
|
* - preserve original positionIncrement of first matched token
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
while (true) {
|
||||||
|
// if there are any generated tokens, return them... don't try any
|
||||||
|
// matches against them, as we specifically don't want recursion.
|
||||||
|
if (replacement!=null && replacement.hasNext()) {
|
||||||
|
copy(this, replacement.next());
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// common case fast-path of first token not matching anything
|
||||||
|
AttributeSource firstTok = nextTok();
|
||||||
|
if (firstTok == null) return false;
|
||||||
|
CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
|
||||||
|
SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
|
||||||
|
if (result == null) {
|
||||||
|
copy(this, firstTok);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// fast-path failed, clone ourselves if needed
|
||||||
|
if (firstTok == this)
|
||||||
|
firstTok = cloneAttributes();
|
||||||
|
// OK, we matched a token, so find the longest match.
|
||||||
|
|
||||||
|
matched = new LinkedList<AttributeSource>();
|
||||||
|
|
||||||
|
result = match(result);
|
||||||
|
|
||||||
|
if (result==null) {
|
||||||
|
// no match, simply return the first token read.
|
||||||
|
copy(this, firstTok);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// reuse, or create new one each time?
|
||||||
|
ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);
|
||||||
|
|
||||||
|
//
|
||||||
|
// there was a match... let's generate the new tokens, merging
|
||||||
|
// in the matched tokens (position increments need adjusting)
|
||||||
|
//
|
||||||
|
AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
|
||||||
|
boolean includeOrig = result.includeOrig();
|
||||||
|
|
||||||
|
AttributeSource origTok = includeOrig ? firstTok : null;
|
||||||
|
PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
|
||||||
|
int repPos=0; // curr position in replacement token stream
|
||||||
|
int pos=0; // current position in merged token stream
|
||||||
|
|
||||||
|
for (int i=0; i<result.synonyms.length; i++) {
|
||||||
|
Token repTok = result.synonyms[i];
|
||||||
|
AttributeSource newTok = firstTok.cloneAttributes();
|
||||||
|
CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
|
||||||
|
OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
|
||||||
|
PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
|
OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
|
newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
|
||||||
|
newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
|
||||||
|
repPos += repTok.getPositionIncrement();
|
||||||
|
if (i==0) repPos=origPos; // make position of first token equal to original
|
||||||
|
|
||||||
|
// if necessary, insert original tokens and adjust position increment
|
||||||
|
while (origTok != null && origPos <= repPos) {
|
||||||
|
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
origPosInc.setPositionIncrement(origPos-pos);
|
||||||
|
generated.add(origTok);
|
||||||
|
pos += origPosInc.getPositionIncrement();
|
||||||
|
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
||||||
|
if (origTok != null) {
|
||||||
|
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
origPos += origPosInc.getPositionIncrement();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
newPosIncAtt.setPositionIncrement(repPos - pos);
|
||||||
|
generated.add(newTok);
|
||||||
|
pos += newPosIncAtt.getPositionIncrement();
|
||||||
|
}
|
||||||
|
|
||||||
|
// finish up any leftover original tokens
|
||||||
|
while (origTok!=null) {
|
||||||
|
PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
origPosInc.setPositionIncrement(origPos-pos);
|
||||||
|
generated.add(origTok);
|
||||||
|
pos += origPosInc.getPositionIncrement();
|
||||||
|
origTok = matched.isEmpty() ? null : matched.removeFirst();
|
||||||
|
if (origTok != null) {
|
||||||
|
origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
origPos += origPosInc.getPositionIncrement();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// what if we replaced a longer sequence with a shorter one?
|
||||||
|
// a/0 b/5 => foo/0
|
||||||
|
// should I re-create the gap on the next buffered token?
|
||||||
|
|
||||||
|
replacement = generated.iterator();
|
||||||
|
// Now return to the top of the loop to read and return the first
|
||||||
|
// generated token.. The reason this is done is that we may have generated
|
||||||
|
// nothing at all, and may need to continue with more matching logic.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Defer creation of the buffer until the first time it is used to
|
||||||
|
// optimize short fields with no matches.
|
||||||
|
//
|
||||||
|
private LinkedList<AttributeSource> buffer;
|
||||||
|
private LinkedList<AttributeSource> matched;
|
||||||
|
|
||||||
|
private boolean exhausted;
|
||||||
|
|
||||||
|
private AttributeSource nextTok() throws IOException {
|
||||||
|
if (buffer!=null && !buffer.isEmpty()) {
|
||||||
|
return buffer.removeFirst();
|
||||||
|
} else {
|
||||||
|
if (!exhausted && input.incrementToken()) {
|
||||||
|
return this;
|
||||||
|
} else {
|
||||||
|
exhausted = true;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void pushTok(AttributeSource t) {
|
||||||
|
if (buffer==null) buffer=new LinkedList<AttributeSource>();
|
||||||
|
buffer.addFirst(t);
|
||||||
|
}
|
||||||
|
|
||||||
|
private SlowSynonymMap match(SlowSynonymMap map) throws IOException {
|
||||||
|
SlowSynonymMap result = null;
|
||||||
|
|
||||||
|
if (map.submap != null) {
|
||||||
|
AttributeSource tok = nextTok();
|
||||||
|
if (tok != null) {
|
||||||
|
// clone ourselves.
|
||||||
|
if (tok == this)
|
||||||
|
tok = cloneAttributes();
|
||||||
|
// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
|
||||||
|
CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
|
||||||
|
SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
|
||||||
|
|
||||||
|
if (subMap != null) {
|
||||||
|
// recurse
|
||||||
|
result = match(subMap);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result != null) {
|
||||||
|
matched.addFirst(tok);
|
||||||
|
} else {
|
||||||
|
// push back unmatched token
|
||||||
|
pushTok(tok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if no longer sequence matched, so if this node has synonyms, it's the match.
|
||||||
|
if (result==null && map.synonyms!=null) {
|
||||||
|
result = map;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void copy(AttributeSource target, AttributeSource source) {
|
||||||
|
if (target != source)
|
||||||
|
source.copyTo(target);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
input.reset();
|
||||||
|
replacement = null;
|
||||||
|
exhausted = false;
|
||||||
|
}
|
||||||
|
}
|
|
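The backtracking scenario described in the header comment of SlowSynonymFilter is easiest to follow with concrete rules. A small sketch, assuming package-level access to these classes (the add() call mirrors the one in SlowSynonymFilterFactory.parseRules); the resulting tokens are stated in comments rather than asserted:

  SlowSynonymMap map = new SlowSynonymMap(true);                      // ignoreCase=true
  map.add(Arrays.asList("a", "b", "c", "d"),
          SlowSynonymMap.makeTokens(Arrays.asList("foo")), false, true);
  map.add(Arrays.asList("b", "c"),
          SlowSynonymMap.makeTokens(Arrays.asList("bar")), false, true);

  Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("a b c x"));
  TokenStream syn = new SlowSynonymFilter(tok, map);
  // Iterating 'syn' yields: a bar x
  // The longer rule "a b c d => foo" is tried first and consumes up to "x" before failing;
  // everything after "a" is pushed back onto the internal buffer, and "b c => bar" then matches.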
@ -0,0 +1,188 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.solr.common.ResourceLoader;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
import org.apache.solr.common.util.StrUtils;
|
||||||
|
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link SlowSynonymFilter} (only used with luceneMatchVersion < 3.4)
|
||||||
|
* <pre class="prettyprint" >
|
||||||
|
* <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
* <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
|
||||||
|
* expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType></pre>
|
||||||
|
* @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
final class SlowSynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||||
|
|
||||||
|
public void inform(ResourceLoader loader) {
|
||||||
|
String synonyms = args.get("synonyms");
|
||||||
|
    if (synonyms == null)
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
    boolean ignoreCase = getBoolean("ignoreCase", false);
    boolean expand = getBoolean("expand", true);

    String tf = args.get("tokenizerFactory");
    TokenizerFactory tokFactory = null;
    if( tf != null ){
      tokFactory = loadTokenizerFactory( loader, tf, args );
    }

    Iterable<String> wlist=loadRules( synonyms, loader );

    synMap = new SlowSynonymMap(ignoreCase);
    parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
  }

  /**
   * @return a list of all rules
   */
  protected Iterable<String> loadRules( String synonyms, ResourceLoader loader ) {
    List<String> wlist=null;
    try {
      File synonymFile = new File(synonyms);
      if (synonymFile.exists()) {
        wlist = loader.getLines(synonyms);
      } else {
        List<String> files = StrUtils.splitFileNames(synonyms);
        wlist = new ArrayList<String>();
        for (String file : files) {
          List<String> lines = loader.getLines(file.trim());
          wlist.addAll(lines);
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return wlist;
  }

  private SlowSynonymMap synMap;

  static void parseRules(Iterable<String> rules, SlowSynonymMap map, String mappingSep,
      String synSep, boolean expansion, TokenizerFactory tokFactory) {
    int count=0;
    for (String rule : rules) {
      // To use regexes, we need an expression that specifies an odd number of chars.
      // This can't really be done with string.split(), and since we need to
      // do unescaping at some point anyway, we wouldn't be saving any effort
      // by using regexes.

      List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);

      List<List<String>> source;
      List<List<String>> target;

      if (mapping.size() > 2) {
        throw new RuntimeException("Invalid Synonym Rule:" + rule);
      } else if (mapping.size()==2) {
        source = getSynList(mapping.get(0), synSep, tokFactory);
        target = getSynList(mapping.get(1), synSep, tokFactory);
      } else {
        source = getSynList(mapping.get(0), synSep, tokFactory);
        if (expansion) {
          // expand to all arguments
          target = source;
        } else {
          // reduce to first argument
          target = new ArrayList<List<String>>(1);
          target.add(source.get(0));
        }
      }

      boolean includeOrig=false;
      for (List<String> fromToks : source) {
        count++;
        for (List<String> toToks : target) {
          map.add(fromToks,
                  SlowSynonymMap.makeTokens(toToks),
                  includeOrig,
                  true
          );
        }
      }
    }
  }

  // a , b c , d e f => [[a],[b,c],[d,e,f]]
  private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
    List<String> strList = StrUtils.splitSmart(str, separator, false);
    // now split on whitespace to get a list of token strings
    List<List<String>> synList = new ArrayList<List<String>>();
    for (String toks : strList) {
      List<String> tokList = tokFactory == null ?
          StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
      synList.add(tokList);
    }
    return synList;
  }

  private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
    StringReader reader = new StringReader( source );
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      while (ts.incrementToken()){
        if( termAtt.length() > 0 )
          tokList.add( termAtt.toString() );
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    finally{
      reader.close();
    }
    return tokList;
  }

  private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
    TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
    tokFactory.init( args );
    return tokFactory;
  }

  private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
    return tokFactory.create( reader );
  }

  public SlowSynonymMap getSynonymMap() {
    return synMap;
  }

  public SlowSynonymFilter create(TokenStream input) {
    return new SlowSynonymFilter(input,synMap);
  }
}
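A minimal sketch of how rules flow through the code above (same package; the rule strings are invented for the example and the usual java.util imports are assumed). The separators match the "=>" and "," that inform() passes to parseRules:

// Illustrative only -- not part of the commit.
List<String> rules = new ArrayList<String>();
rules.add("sofa, couch, divan");        // equivalence group: with expand=true every word maps to all of them
rules.add("i-pod, i pod => ipod");      // explicit mapping: the left-hand alternatives rewrite to "ipod"

SlowSynonymMap map = new SlowSynonymMap(true);                            // true = ignoreCase
SlowSynonymFilterFactory.parseRules(rules, map, "=>", ",", true, null);   // null tokFactory = whitespace splitting
// "map" can now back a SlowSynonymFilter, exactly as create() does above.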
@ -0,0 +1,162 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.analysis;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.Version;

import java.util.*;

/** Mapping rules for use with {@link SlowSynonymFilter}
 * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
 */
@Deprecated
class SlowSynonymMap {
  /** @lucene.internal */
  public CharArrayMap<SlowSynonymMap> submap; // recursive: Map<String, SynonymMap>
  /** @lucene.internal */
  public Token[] synonyms;
  int flags;

  static final int INCLUDE_ORIG=0x01;
  static final int IGNORE_CASE=0x02;

  public SlowSynonymMap() {}
  public SlowSynonymMap(boolean ignoreCase) {
    if (ignoreCase) flags |= IGNORE_CASE;
  }

  public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; }
  public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; }

  /**
   * @param singleMatch List<String>, the sequence of strings to match
   * @param replacement List<Token> the list of tokens to use on a match
   * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
   * @param mergeExisting merge the replacement tokens with any other mappings that exist
   */
  public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
    SlowSynonymMap currMap = this;
    for (String str : singleMatch) {
      if (currMap.submap==null) {
        // for now hardcode at 4.0, as its what the old code did.
        // would be nice to fix, but shouldn't store a version in each submap!!!
        currMap.submap = new CharArrayMap<SlowSynonymMap>(Version.LUCENE_40, 1, ignoreCase());
      }

      SlowSynonymMap map = currMap.submap.get(str);
      if (map==null) {
        map = new SlowSynonymMap();
        map.flags |= flags & IGNORE_CASE;
        currMap.submap.put(str, map);
      }

      currMap = map;
    }

    if (currMap.synonyms != null && !mergeExisting) {
      throw new RuntimeException("SynonymFilter: there is already a mapping for " + singleMatch);
    }
    List<Token> superset = currMap.synonyms==null ? replacement :
        mergeTokens(Arrays.asList(currMap.synonyms), replacement);
    currMap.synonyms = superset.toArray(new Token[superset.size()]);
    if (includeOrig) currMap.flags |= INCLUDE_ORIG;
  }


  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder("<");
    if (synonyms!=null) {
      sb.append("[");
      for (int i=0; i<synonyms.length; i++) {
        if (i!=0) sb.append(',');
        sb.append(synonyms[i]);
      }
      if ((flags & INCLUDE_ORIG)!=0) {
        sb.append(",ORIG");
      }
      sb.append("],");
    }
    sb.append(submap);
    sb.append(">");
    return sb.toString();
  }


  /** Produces a List<Token> from a List<String> */
  public static List<Token> makeTokens(List<String> strings) {
    List<Token> ret = new ArrayList<Token>(strings.size());
    for (String str : strings) {
      //Token newTok = new Token(str,0,0,"SYNONYM");
      Token newTok = new Token(str, 0,0,"SYNONYM");
      ret.add(newTok);
    }
    return ret;
  }


  /**
   * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
   * the tokens end up at the same position.
   *
   * Example:  [a b] merged with [c d] produces [a/b c/d]  ('/' denotes tokens in the same position)
   * Example:  [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2]  (a,n means a has posInc=n)
   *
   */
  public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
    ArrayList<Token> result = new ArrayList<Token>();
    if (lst1 ==null || lst2 ==null) {
      if (lst2 != null) result.addAll(lst2);
      if (lst1 != null) result.addAll(lst1);
      return result;
    }

    int pos=0;
    Iterator<Token> iter1=lst1.iterator();
    Iterator<Token> iter2=lst2.iterator();
    Token tok1 = iter1.hasNext() ? iter1.next() : null;
    Token tok2 = iter2.hasNext() ? iter2.next() : null;
    int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0;
    int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0;
    while(tok1!=null || tok2!=null) {
      while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
        Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
        tok.copyBuffer(tok1.buffer(), 0, tok1.length());
        tok.setPositionIncrement(pos1-pos);
        result.add(tok);
        pos=pos1;
        tok1 = iter1.hasNext() ? iter1.next() : null;
        pos1 += tok1!=null ? tok1.getPositionIncrement() : 0;
      }
      while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
        Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
        tok.copyBuffer(tok2.buffer(), 0, tok2.length());
        tok.setPositionIncrement(pos2-pos);
        result.add(tok);
        pos=pos2;
        tok2 = iter2.hasNext() ? iter2.next() : null;
        pos2 += tok2!=null ? tok2.getPositionIncrement() : 0;
      }
    }
    return result;
  }

}
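A small illustration of the API above (a sketch, not part of the commit; same package, with java.util.Arrays and java.util.List assumed): registering a two-token match that rewrites to a single token, which is what SlowSynonymFilterFactory.parseRules does for each parsed rule.

SlowSynonymMap map = new SlowSynonymMap(true);                 // true = ignoreCase
map.add(Arrays.asList("a", "b"),                               // token sequence to match
        SlowSynonymMap.makeTokens(Arrays.asList("c")),         // replacement tokens
        false,                                                 // includeOrig: don't also emit the matched "a b"
        true);                                                 // mergeExisting: merge into any mapping already present
// After this call, map.submap.get("a").submap.get("b").synonyms holds the single "c" token.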
@ -1,189 +1,54 @@
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.solr.analysis;

+import java.util.Map;
+
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
 import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;

 /**
  * Factory for {@link SynonymFilter}.
  * <pre class="prettyprint" >
  * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
  *   <analyzer>
  *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-  *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
-  *             expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+  *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
+  *             format="solr" ignoreCase="false" expand="true"
+  *             tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
  *   </analyzer>
  * </fieldType></pre>
- *
  */
 public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
-  public void inform(ResourceLoader loader) {
-    String synonyms = args.get("synonyms");
-    if (synonyms == null)
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");
-    boolean ignoreCase = getBoolean("ignoreCase", false);
-    boolean expand = getBoolean("expand", true);
-
-    String tf = args.get("tokenizerFactory");
-    TokenizerFactory tokFactory = null;
-    if( tf != null ){
-      tokFactory = loadTokenizerFactory( loader, tf, args );
-    }
-
-    Iterable<String> wlist=loadRules( synonyms, loader );
-
-    synMap = new SynonymMap(ignoreCase);
-    parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
-  }
-
-  /**
-   * @return a list of all rules
-   */
-  protected Iterable<String> loadRules( String synonyms, ResourceLoader loader ) {
-    List<String> wlist=null;
-    try {
-      File synonymFile = new File(synonyms);
-      if (synonymFile.exists()) {
-        wlist = loader.getLines(synonyms);
-      } else {
-        List<String> files = StrUtils.splitFileNames(synonyms);
-        wlist = new ArrayList<String>();
-        for (String file : files) {
-          List<String> lines = loader.getLines(file.trim());
-          wlist.addAll(lines);
-        }
-      }
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-    return wlist;
-  }
-
-  private SynonymMap synMap;
-
-  static void parseRules(Iterable<String> rules, SynonymMap map, String mappingSep,
-      String synSep, boolean expansion, TokenizerFactory tokFactory) {
-    int count=0;
-    for (String rule : rules) {
-      // To use regexes, we need an expression that specifies an odd number of chars.
-      // This can't really be done with string.split(), and since we need to
-      // do unescaping at some point anyway, we wouldn't be saving any effort
-      // by using regexes.
-
-      List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);
-
-      List<List<String>> source;
-      List<List<String>> target;
-
-      if (mapping.size() > 2) {
-        throw new RuntimeException("Invalid Synonym Rule:" + rule);
-      } else if (mapping.size()==2) {
-        source = getSynList(mapping.get(0), synSep, tokFactory);
-        target = getSynList(mapping.get(1), synSep, tokFactory);
-      } else {
-        source = getSynList(mapping.get(0), synSep, tokFactory);
-        if (expansion) {
-          // expand to all arguments
-          target = source;
-        } else {
-          // reduce to first argument
-          target = new ArrayList<List<String>>(1);
-          target.add(source.get(0));
-        }
-      }
-
-      boolean includeOrig=false;
-      for (List<String> fromToks : source) {
-        count++;
-        for (List<String> toToks : target) {
-          map.add(fromToks,
-                  SynonymMap.makeTokens(toToks),
-                  includeOrig,
-                  true
-          );
-        }
-      }
-    }
-  }
-
-  // a , b c , d e f => [[a],[b,c],[d,e,f]]
-  private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
-    List<String> strList = StrUtils.splitSmart(str, separator, false);
-    // now split on whitespace to get a list of token strings
-    List<List<String>> synList = new ArrayList<List<String>>();
-    for (String toks : strList) {
-      List<String> tokList = tokFactory == null ?
-          StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
-      synList.add(tokList);
-    }
-    return synList;
-  }
-
-  private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
-    StringReader reader = new StringReader( source );
-    TokenStream ts = loadTokenizer(tokFactory, reader);
-    List<String> tokList = new ArrayList<String>();
-    try {
-      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
-      while (ts.incrementToken()){
-        if( termAtt.length() > 0 )
-          tokList.add( termAtt.toString() );
-      }
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-    finally{
-      reader.close();
-    }
-    return tokList;
-  }
-
-  private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
-    TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
-    tokFactory.init( args );
-    return tokFactory;
-  }
-
-  private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
-    return tokFactory.create( reader );
-  }
-
-  public SynonymMap getSynonymMap() {
-    return synMap;
-  }
-
-  public SynonymFilter create(TokenStream input) {
-    return new SynonymFilter(input,synMap);
-  }
+  private BaseTokenFilterFactory delegator;
+
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+    if (luceneMatchVersion.onOrAfter(Version.LUCENE_34)) {
+      delegator = new FSTSynonymFilterFactory();
+    } else {
+      // check if you use the new optional arg "format". this makes no sense for the old one,
+      // as its wired to solr's synonyms format only.
+      if (args.containsKey("format") && !args.get("format").equals("solr")) {
+        throw new IllegalArgumentException("You must specify luceneMatchVersion >= 3.4 to use alternate synonyms formats");
+      }
+      delegator = new SlowSynonymFilterFactory();
+    }
+    delegator.init(args);
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    assert delegator != null : "init() was not called!";
+    return delegator.create(input);
+  }
+
+  @Override
+  public void inform(ResourceLoader loader) {
+    assert delegator != null : "init() was not called!";
+    ((ResourceLoaderAware) delegator).inform(loader);
+  }
 }
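The rewritten factory is now only a thin dispatcher: with luceneMatchVersion >= 3.4 it builds the new FST-based SynonymFilter (which also honors the optional format argument), while older versions fall back to SlowSynonymFilterFactory and its Solr-only syntax. For reference, an illustrative synonyms file in that Solr format (entries invented for the example; '#' starts a comment, ',' builds an equivalence group, '=>' writes an explicit mapping):

# example synonyms.txt
GB, gib, gigabyte, gigabytes
sofa, couch, divan
i-pod, i pod => ipod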
@ -17,30 +17,69 @@
 package org.apache.solr.analysis;

+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.synonym.SynonymFilter;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.junit.Test;
+import org.apache.solr.common.ResourceLoader;

+import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.StringReader;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;

 /**
  * @since solr 1.4
  */
 public class TestMultiWordSynonyms extends BaseTokenTestCase {

-  @Test
-  public void testMultiWordSynonyms() throws IOException {
+  /**
+   * @deprecated Remove this test in 5.0
+   */
+  @Deprecated
+  public void testMultiWordSynonymsOld() throws IOException {
     List<String> rules = new ArrayList<String>();
     rules.add("a b c,d");
-    SynonymMap synMap = new SynonymMap(true);
-    SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
+    SlowSynonymMap synMap = new SlowSynonymMap(true);
+    SlowSynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);

-    SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
+    SlowSynonymFilter ts = new SlowSynonymFilter(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("a e")), synMap);
     // This fails because ["e","e"] is the value of the token stream
     assertTokenStreamContents(ts, new String[] { "a", "e" });
   }

+  public void testMultiWordSynonyms() throws IOException {
+    SynonymFilterFactory factory = new SynonymFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.putAll(DEFAULT_VERSION_PARAM);
+    args.put("synonyms", "synonyms.txt");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader("a b c,d"));
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false));
+    // This fails because ["e","e"] is the value of the token stream
+    assertTokenStreamContents(ts, new String[] { "a", "e" });
+  }
+
+  private class StringMockSolrResourceLoader implements ResourceLoader {
+    String text;
+
+    StringMockSolrResourceLoader(String text) {
+      this.text = text;
+    }
+
+    public List<String> getLines(String resource) throws IOException {
+      return null;
+    }
+
+    public Object newInstance(String cname, String... subpackages) {
+      return null;
+    }
+
+    public InputStream openResource(String resource) throws IOException {
+      return new ByteArrayInputStream(text.getBytes("UTF-8"));
+    }
+  }
 }
@ -15,7 +15,7 @@
  * limitations under the License.
  */

-package org.apache.lucene.analysis.synonym;
+package org.apache.solr.analysis;

 import java.io.IOException;
 import java.io.StringReader;
@ -29,51 +29,52 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;

 /**
+ * @deprecated Remove this test in Lucene 5.0
  */
-public class TestSynonymFilter extends BaseTokenStreamTestCase {
+@Deprecated
+public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {

   static List<String> strings(String str) {
     String[] arr = str.split(" ");
     return Arrays.asList(arr);
   }

-  static void assertTokenizesTo(SynonymMap dict, String input,
+  static void assertTokenizesTo(SlowSynonymMap dict, String input,
       String expected[]) throws IOException {
     Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
-    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+    SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
     assertTokenStreamContents(stream, expected);
   }

-  static void assertTokenizesTo(SynonymMap dict, String input,
+  static void assertTokenizesTo(SlowSynonymMap dict, String input,
       String expected[], int posIncs[]) throws IOException {
     Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
-    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+    SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
     assertTokenStreamContents(stream, expected, posIncs);
   }

-  static void assertTokenizesTo(SynonymMap dict, List<Token> input,
+  static void assertTokenizesTo(SlowSynonymMap dict, List<Token> input,
       String expected[], int posIncs[])
       throws IOException {
     TokenStream tokenizer = new IterTokenStream(input);
-    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+    SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
     assertTokenStreamContents(stream, expected, posIncs);
   }

-  static void assertTokenizesTo(SynonymMap dict, List<Token> input,
+  static void assertTokenizesTo(SlowSynonymMap dict, List<Token> input,
       String expected[], int startOffsets[], int endOffsets[], int posIncs[])
       throws IOException {
     TokenStream tokenizer = new IterTokenStream(input);
-    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
+    SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
     assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
         posIncs);
   }

   public void testMatching() throws IOException {
-    SynonymMap map = new SynonymMap();
+    SlowSynonymMap map = new SlowSynonymMap();

     boolean orig = false;
     boolean merge = true;
@ -110,7 +111,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
   }

   public void testIncludeOrig() throws IOException {
-    SynonymMap map = new SynonymMap();
+    SlowSynonymMap map = new SlowSynonymMap();

     boolean orig = true;
     boolean merge = true;
@ -167,7 +168,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {


   public void testMapMerge() throws IOException {
-    SynonymMap map = new SynonymMap();
+    SlowSynonymMap map = new SlowSynonymMap();

     boolean orig = false;
     boolean merge = true;
@ -206,7 +207,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {


   public void testOverlap() throws IOException {
-    SynonymMap map = new SynonymMap();
+    SlowSynonymMap map = new SlowSynonymMap();

     boolean orig = false;
     boolean merge = true;
@ -229,7 +230,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
   }

   public void testPositionIncrements() throws IOException {
-    SynonymMap map = new SynonymMap();
+    SlowSynonymMap map = new SlowSynonymMap();

     boolean orig = false;
     boolean merge = true;
|
@ -264,7 +265,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
|
||||||
public void testPositionIncrementsWithOrig() throws IOException {
|
public void testPositionIncrementsWithOrig() throws IOException {
|
||||||
SynonymMap map = new SynonymMap();
|
SlowSynonymMap map = new SlowSynonymMap();
|
||||||
|
|
||||||
boolean orig = true;
|
boolean orig = true;
|
||||||
boolean merge = true;
|
boolean merge = true;
|
||||||
|
@ -304,7 +305,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
     // x=>y
     // analysing "a x" causes "y" to have a bad offset (end less than start)
     // SOLR-167
-    SynonymMap map = new SynonymMap();
+    SlowSynonymMap map = new SlowSynonymMap();

     boolean orig = false;
     boolean merge = true;
@ -0,0 +1,62 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.util.Version;
import org.apache.solr.core.SolrResourceLoader;

public class TestSynonymFilterFactory extends BaseTokenTestCase {
  /** test that we can parse and use the solr syn file */
  public void testSynonyms() throws Exception {
    SynonymFilterFactory factory = new SynonymFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.putAll(DEFAULT_VERSION_PARAM);
    args.put("synonyms", "synonyms.txt");
    factory.init(args);
    factory.inform(new SolrResourceLoader(null, null));
    TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
    assertTrue(ts instanceof SynonymFilter);
    assertTokenStreamContents(ts,
        new String[] { "GB", "gib", "gigabyte", "gigabytes" },
        new int[] { 1, 0, 0, 0 });
  }

  /** test that we can parse and use the solr syn file, with the old impl
   * @deprecated Remove this test in Lucene 5.0 */
  @Deprecated
  public void testSynonymsOld() throws Exception {
    SynonymFilterFactory factory = new SynonymFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", Version.LUCENE_33.toString());
    args.put("synonyms", "synonyms.txt");
    factory.init(args);
    factory.inform(new SolrResourceLoader(null, null));
    TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
    assertTrue(ts instanceof SlowSynonymFilter);
    assertTokenStreamContents(ts,
        new String[] { "GB", "gib", "gigabyte", "gigabytes" },
        new int[] { 1, 0, 0, 0 });
  }
}
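Both tests resolve synonyms.txt through SolrResourceLoader, so the asserted output only holds if that test resource contains an expansion rule along the lines of the following (an assumption about the resource file, which is not shown in this diff):

GB,gib,gigabyte,gigabytes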
@ -25,32 +25,35 @@ import java.util.List;
 import java.util.Map;

 import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.solr.common.ResourceLoader;


+/**
+ * @deprecated Remove this test in Lucene 5.0
+ */
+@Deprecated
 public class TestSynonymMap extends LuceneTestCase {

   public void testInvalidMappingRules() throws Exception {
-    SynonymMap synMap = new SynonymMap( true );
+    SlowSynonymMap synMap = new SlowSynonymMap( true );
     List<String> rules = new ArrayList<String>( 1 );
     rules.add( "a=>b=>c" );
     try{
-      SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+      SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
       fail( "RuntimeException must be thrown." );
     }
     catch( RuntimeException expected ){}
   }

   public void testReadMappingRules() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;

     // (a)->[b]
     List<String> rules = new ArrayList<String>();
     rules.add( "a=>b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "b" );
@ -58,8 +61,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (b)->[c]
     rules.clear();
     rules.add( "a,b=>c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "c" );
     assertTokIncludes( synMap, "b", "c" );
@ -67,8 +70,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (a)->[b][c]
     rules.clear();
     rules.add( "a=>b,c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "b" );
     assertTokIncludes( synMap, "a", "c" );
@ -78,8 +81,8 @@ public class TestSynonymMap extends LuceneTestCase {
     rules.clear();
     rules.add( "a=>a1" );
     rules.add( "a b=>a2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a1" );
     assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@ -92,8 +95,8 @@ public class TestSynonymMap extends LuceneTestCase {
     rules.add( "a=>a1" );
     rules.add( "a b=>a2" );
     rules.add( "a c=>a3" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 1, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a1" );
     assertEquals( 2, getSubSynonymMap( synMap, "a" ).submap.size() );
@ -109,8 +112,8 @@ public class TestSynonymMap extends LuceneTestCase {
     rules.add( "a b=>a2" );
     rules.add( "b=>b1" );
     rules.add( "b c=>b2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a1" );
     assertEquals( 1, getSubSynonymMap( synMap, "a" ).submap.size() );
@ -121,14 +124,14 @@ public class TestSynonymMap extends LuceneTestCase {
   }

   public void testRead1waySynonymRules() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;

     // (a)->[a]
     // (b)->[a]
     List<String> rules = new ArrayList<String>();
     rules.add( "a,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "b", "a" );
@ -138,8 +141,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (c)->[a]
     rules.clear();
     rules.add( "a,b,c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 3, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "b", "a" );
@ -149,8 +152,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (b1)->(b2)->[a]
     rules.clear();
     rules.add( "a,b1 b2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertEquals( 1, getSubSynonymMap( synMap, "b1" ).submap.size() );
@ -160,8 +163,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (b)->[a1][a2]
     rules.clear();
     rules.add( "a1 a2,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", false, null);
     assertEquals( 2, synMap.submap.size() );
     assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
     assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@ -171,14 +174,14 @@ public class TestSynonymMap extends LuceneTestCase {
   }

   public void testRead2waySynonymRules() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;

     // (a)->[a][b]
     // (b)->[a][b]
     List<String> rules = new ArrayList<String>();
     rules.add( "a,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b" );
@ -190,8 +193,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (c)->[a][b][c]
     rules.clear();
     rules.add( "a,b,c" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 3, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b" );
@ -209,8 +212,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // [b1][b2]
     rules.clear();
     rules.add( "a,b1 b2" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b1" );
@ -226,8 +229,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // [b]
     rules.clear();
     rules.add( "a1 a2,b" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, null);
     assertEquals( 2, synMap.submap.size() );
     assertEquals( 1, getSubSynonymMap( synMap, "a1" ).submap.size() );
     assertTokIncludes( getSubSynonymMap( synMap, "a1" ), "a2", "a1" );
@ -239,7 +242,7 @@ public class TestSynonymMap extends LuceneTestCase {
   }

   public void testBigramTokenizer() throws Exception {
-    SynonymMap synMap;
+    SlowSynonymMap synMap;

     // prepare bi-gram tokenizer factory
     BaseTokenizerFactory tf = new NGramTokenizerFactory();
@ -251,8 +254,8 @@ public class TestSynonymMap extends LuceneTestCase {
     // (ab)->(bc)->(cd)->[ef][fg][gh]
     List<String> rules = new ArrayList<String>();
     rules.add( "abcd=>efgh" );
-    synMap = new SynonymMap( true );
-    SynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
+    synMap = new SlowSynonymMap( true );
+    SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf);
     assertEquals( 1, synMap.submap.size() );
     assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() );
     assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() );
@ -265,7 +268,7 @@ public class TestSynonymMap extends LuceneTestCase {
   public void testLoadRules() throws Exception {
     Map<String, String> args = new HashMap<String, String>();
     args.put( "synonyms", "something.txt" );
-    SynonymFilterFactory ff = new SynonymFilterFactory();
+    SlowSynonymFilterFactory ff = new SlowSynonymFilterFactory();
     ff.init(args);
     ff.inform( new ResourceLoader() {
       @Override
@ -289,7 +292,7 @@ public class TestSynonymMap extends LuceneTestCase {
       }
     });

-    SynonymMap synMap = ff.getSynonymMap();
+    SlowSynonymMap synMap = ff.getSynonymMap();
     assertEquals( 2, synMap.submap.size() );
     assertTokIncludes( synMap, "a", "a" );
     assertTokIncludes( synMap, "a", "b" );
@ -298,7 +301,7 @@ public class TestSynonymMap extends LuceneTestCase {
   }


-  private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception {
+  private void assertTokIncludes( SlowSynonymMap map, String src, String exp ) throws Exception {
     Token[] tokens = map.submap.get( src ).synonyms;
     boolean inc = false;
     for( Token token : tokens ){
@ -308,7 +311,7 @@ public class TestSynonymMap extends LuceneTestCase {
     assertTrue( inc );
   }

-  private SynonymMap getSubSynonymMap( SynonymMap map, String src ){
+  private SlowSynonymMap getSubSynonymMap( SlowSynonymMap map, String src ){
     return map.submap.get( src );
   }
 }