mirror of https://github.com/apache/lucene.git
make sure code works with WordNet2.0 (no problem) and add Query expansion, and comments
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151031 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9e7e1d417f
commit
9eca2a3160
|
@ -29,6 +29,7 @@
|
||||||
</java>
|
</java>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
|
||||||
<target name="synonym" description="Find synonyms for word">
|
<target name="synonym" description="Find synonyms for word">
|
||||||
<fail unless="synindex.exists">
|
<fail unless="synindex.exists">
|
||||||
Index does not exist.
|
Index does not exist.
|
||||||
|
@ -49,4 +50,24 @@
|
||||||
</java>
|
</java>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
<target name="expand" description="Perform synonym expansion on a query">
|
||||||
|
<fail unless="synindex.exists">
|
||||||
|
Index does not exist.
|
||||||
|
</fail>
|
||||||
|
|
||||||
|
<fail unless="query">
|
||||||
|
Must specify 'query' property.
|
||||||
|
</fail>
|
||||||
|
|
||||||
|
<java classname="org.apache.lucene.wordnet.SynExpand">
|
||||||
|
<classpath>
|
||||||
|
<path refid="compile.classpath"/>
|
||||||
|
<pathelement location="${build.classes.dir}"/>
|
||||||
|
</classpath>
|
||||||
|
|
||||||
|
<arg file="${synindex.dir}"/>
|
||||||
|
<arg value="${query}"/>
|
||||||
|
</java>
|
||||||
|
</target>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -0,0 +1,127 @@
|
||||||
|
package org.apache.lucene.wordnet;
|
||||||
|
|
||||||
|
import org.apache.lucene.store.*;
|
||||||
|
import org.apache.lucene.search.*;
|
||||||
|
import org.apache.lucene.index.*;
|
||||||
|
import org.apache.lucene.document.*;
|
||||||
|
import org.apache.lucene.analysis.*;
|
||||||
|
import org.apache.lucene.analysis.standard.*;
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expand a query by looking up synonyms for every term.
|
||||||
|
* You need to invoke {@see Syns2Index} first to build the synonym index.
|
||||||
|
*
|
||||||
|
* @see Syns2Index
|
||||||
|
*/
|
||||||
|
public final class SynExpand {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test driver for synonym expansion.
|
||||||
|
* Uses boost factor of 0.9 for illustrative purposes.
|
||||||
|
*
|
||||||
|
* If you pass in the query "big dog" then it prints out:
|
||||||
|
*
|
||||||
|
* <code><pre>
|
||||||
|
* Query: big adult^0.9 bad^0.9 bighearted^0.9 boastful^0.9 boastfully^0.9 bounteous^0.9 bountiful^0.9 braggy^0.9 crowing^0.9 freehanded^0.9 giving^0.9 grown^0.9 grownup^0.9 handsome^0.9 large^0.9 liberal^0.9 magnanimous^0.9 momentous^0.9 openhanded^0.9 prominent^0.9 swelled^0.9 vainglorious^0.9 vauntingly^0.9
|
||||||
|
* dog andiron^0.9 blackguard^0.9 bounder^0.9 cad^0.9 chase^0.9 click^0.9 detent^0.9 dogtooth^0.9 firedog^0.9 frank^0.9 frankfurter^0.9 frump^0.9 heel^0.9 hotdog^0.9 hound^0.9 pawl^0.9 tag^0.9 tail^0.9 track^0.9 trail^0.9 weenie^0.9 wiener^0.9 wienerwurst^0.9
|
||||||
|
* </pre></code>
|
||||||
|
*/
|
||||||
|
public static void main(String[] args) throws IOException
|
||||||
|
{
|
||||||
|
if (args.length != 2)
|
||||||
|
{
|
||||||
|
System.out.println(
|
||||||
|
"java org.apache.lucene.wordnet.SynExpand <index path> <query>");
|
||||||
|
}
|
||||||
|
|
||||||
|
FSDirectory directory = FSDirectory.getDirectory(args[0], false);
|
||||||
|
IndexSearcher searcher = new IndexSearcher(directory);
|
||||||
|
|
||||||
|
String query = args[1];
|
||||||
|
String field = "contents";
|
||||||
|
|
||||||
|
Query q = expand( query, searcher, new StandardAnalyzer(), field, 0.9f);
|
||||||
|
System.out.println( "Query: " + q.toString( field));
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
searcher.close();
|
||||||
|
directory.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform synonym expansion on a query.
|
||||||
|
*
|
||||||
|
* @param query users query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser.
|
||||||
|
*
|
||||||
|
* @param syns a opened to the Lucene index you previously created with {@see Syns2Index}. The searcher is not closed or otherwise altered.
|
||||||
|
*
|
||||||
|
* @param a optional analyzer used to parse the users query else {@see StandardAnalzyer} is used
|
||||||
|
*
|
||||||
|
* @param field optional field name to search in or null if you want the default of "contents"
|
||||||
|
*
|
||||||
|
* @param boost optional boost applied to synonyms else no boost is applied
|
||||||
|
*
|
||||||
|
* @return the expanded Query
|
||||||
|
*/
|
||||||
|
public static Query expand( String query,
|
||||||
|
Searcher syns,
|
||||||
|
Analyzer a,
|
||||||
|
String field,
|
||||||
|
float boost)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
Set already = new HashSet(); // avoid dups
|
||||||
|
List top = new LinkedList(); // needs to be separately listed..
|
||||||
|
if ( field == null) field = "contents";
|
||||||
|
if ( a == null) a = new StandardAnalyzer();
|
||||||
|
|
||||||
|
// [1] Parse query into separate words so that when we expand we can avoid dups
|
||||||
|
TokenStream ts = a.tokenStream( field, new StringReader( query));
|
||||||
|
org.apache.lucene.analysis.Token t;
|
||||||
|
while ( (t = ts.next()) != null)
|
||||||
|
{
|
||||||
|
String word = t.termText();
|
||||||
|
if ( already.add( word))
|
||||||
|
top.add( word);
|
||||||
|
}
|
||||||
|
BooleanQuery tmp = new BooleanQuery();
|
||||||
|
|
||||||
|
// [2] form query
|
||||||
|
Iterator it = top.iterator();
|
||||||
|
while ( it.hasNext())
|
||||||
|
{
|
||||||
|
// [2a] add to level words in
|
||||||
|
String word = (String) it.next();
|
||||||
|
TermQuery tq = new TermQuery( new Term( field, word));
|
||||||
|
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
||||||
|
|
||||||
|
// [2b] add in unique synonums
|
||||||
|
Hits hits = syns.search( new TermQuery( new Term(Syns2Index.F_WORD, word)));
|
||||||
|
for (int i = 0; i < hits.length(); i++)
|
||||||
|
{
|
||||||
|
Document doc = hits.doc(i);
|
||||||
|
String[] values = doc.getValues( Syns2Index.F_SYN);
|
||||||
|
for ( int j = 0; j < values.length; j++)
|
||||||
|
{
|
||||||
|
String syn = values[ j];
|
||||||
|
if ( already.add( syn)) // avoid dups of top level words and synonyms
|
||||||
|
{
|
||||||
|
tq = new TermQuery( new Term( field, syn));
|
||||||
|
if ( boost > 0) // else keep normal 1.0
|
||||||
|
tq.setBoost( boost);
|
||||||
|
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,45 +1,114 @@
|
||||||
package org.apache.lucene.wordnet;
|
package org.apache.lucene.wordnet;
|
||||||
|
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.*;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.*;
|
||||||
import org.apache.lucene.search.TermQuery;
|
import org.apache.lucene.index.*;
|
||||||
import org.apache.lucene.search.Hits;
|
import org.apache.lucene.document.*;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.analysis.*;
|
||||||
import org.apache.lucene.document.Document;
|
import java.io.*;
|
||||||
import java.io.IOException;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test program to look up synonyms.
|
||||||
|
*/
|
||||||
public class SynLookup {
|
public class SynLookup {
|
||||||
|
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
if (args.length != 2) {
|
if (args.length != 2) {
|
||||||
System.out.println(
|
System.out.println(
|
||||||
"java org.apache.lucene.wordnet.SynLookup <index path> <word>");
|
"java org.apache.lucene.wordnet.SynLookup <index path> <word>");
|
||||||
}
|
}
|
||||||
|
|
||||||
FSDirectory directory = FSDirectory.getDirectory(args[0], false);
|
FSDirectory directory = FSDirectory.getDirectory(args[0], false);
|
||||||
IndexSearcher searcher = new IndexSearcher(directory);
|
IndexSearcher searcher = new IndexSearcher(directory);
|
||||||
|
|
||||||
String word = args[1];
|
String word = args[1];
|
||||||
Hits hits = searcher.search(
|
Hits hits = searcher.search(
|
||||||
new TermQuery(new Term("word", word)));
|
new TermQuery(new Term(Syns2Index.F_WORD, word)));
|
||||||
|
|
||||||
if (hits.length() == 0) {
|
if (hits.length() == 0) {
|
||||||
System.out.println("No synonyms found for " + word);
|
System.out.println("No synonyms found for " + word);
|
||||||
} else {
|
} else {
|
||||||
System.out.println("Synonyms found for \"" + word + "\":");
|
System.out.println("Synonyms found for \"" + word + "\":");
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < hits.length(); i++) {
|
for (int i = 0; i < hits.length(); i++) {
|
||||||
Document doc = hits.doc(i);
|
Document doc = hits.doc(i);
|
||||||
|
|
||||||
String[] values = doc.getValues("syn");
|
String[] values = doc.getValues(Syns2Index.F_SYN);
|
||||||
|
|
||||||
for (int j = 0; j < values.length; j++) {
|
for (int j = 0; j < values.length; j++) {
|
||||||
System.out.println(values[j]);
|
System.out.println(values[j]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
searcher.close();
|
searcher.close();
|
||||||
directory.close();
|
directory.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform synonym expansion on a query.
|
||||||
|
*
|
||||||
|
* @param query
|
||||||
|
* @param syns
|
||||||
|
* @param a
|
||||||
|
* @param field
|
||||||
|
* @param boost
|
||||||
|
*/
|
||||||
|
public static Query expand( String query,
|
||||||
|
Searcher syns,
|
||||||
|
Analyzer a,
|
||||||
|
String field,
|
||||||
|
float boost)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
Set already = new HashSet(); // avoid dups
|
||||||
|
List top = new LinkedList(); // needs to be separately listed..
|
||||||
|
|
||||||
|
// [1] Parse query into separate words so that when we expand we can avoid dups
|
||||||
|
TokenStream ts = a.tokenStream( field, new StringReader( query));
|
||||||
|
org.apache.lucene.analysis.Token t;
|
||||||
|
while ( (t = ts.next()) != null)
|
||||||
|
{
|
||||||
|
String word = t.termText();
|
||||||
|
if ( already.add( word))
|
||||||
|
top.add( word);
|
||||||
|
}
|
||||||
|
BooleanQuery tmp = new BooleanQuery();
|
||||||
|
|
||||||
|
// [2] form query
|
||||||
|
Iterator it = top.iterator();
|
||||||
|
while ( it.hasNext())
|
||||||
|
{
|
||||||
|
// [2a] add to level words in
|
||||||
|
String word = (String) it.next();
|
||||||
|
TermQuery tq = new TermQuery( new Term( field, word));
|
||||||
|
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
||||||
|
|
||||||
|
// [2b] add in unique synonums
|
||||||
|
Hits hits = syns.search( new TermQuery( new Term(Syns2Index.F_WORD, word)));
|
||||||
|
for (int i = 0; i < hits.length(); i++)
|
||||||
|
{
|
||||||
|
Document doc = hits.doc(i);
|
||||||
|
String[] values = doc.getValues( Syns2Index.F_SYN);
|
||||||
|
for ( int j = 0; j < values.length; j++)
|
||||||
|
{
|
||||||
|
String syn = values[ j];
|
||||||
|
if ( already.add( syn))
|
||||||
|
{
|
||||||
|
tq = new TermQuery( new Term( field, syn));
|
||||||
|
if ( boost > 0) // else keep normal 1.0
|
||||||
|
tq.setBoost( boost);
|
||||||
|
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,8 +23,10 @@ import java.util.TreeSet;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/~wn/obtain.shtml">WordNet prolog download</a>
|
* Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
|
||||||
* into a Lucene index suitable for looking up synonyms and performing query expansion.
|
* into a Lucene index suitable for looking up synonyms and performing query expansion ({@see SynExpand#expand SynExpand.expand(...)}).
|
||||||
|
*
|
||||||
|
* This has been tested with WordNet 2.0.
|
||||||
*
|
*
|
||||||
* The index has fields named "word" ({@see #F_WORD})
|
* The index has fields named "word" ({@see #F_WORD})
|
||||||
* and "syn" ({@see #F_SYN}).
|
* and "syn" ({@see #F_SYN}).
|
||||||
|
@ -40,8 +42,7 @@ import java.util.TreeMap;
|
||||||
* related meanings we don't do that here.
|
* related meanings we don't do that here.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* This can take 8 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
|
* This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
|
||||||
* If you boost the minMergeDocuments and mergeFactor of the index writer than you can get this down to under 4 minutes.
|
|
||||||
*
|
*
|
||||||
* @author Dave Spencer, dave@searchmorph.com
|
* @author Dave Spencer, dave@searchmorph.com
|
||||||
* @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
|
* @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
|
||||||
|
@ -76,7 +77,7 @@ public class Syns2Index
|
||||||
private static final Analyzer ana = new StandardAnalyzer();
|
private static final Analyzer ana = new StandardAnalyzer();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Takes optional arg of prolog file name.
|
* Takes arg of prolog file name and index directory.
|
||||||
*/
|
*/
|
||||||
public static void main(String[] args)
|
public static void main(String[] args)
|
||||||
throws Throwable
|
throws Throwable
|
||||||
|
@ -228,9 +229,10 @@ public class Syns2Index
|
||||||
|
|
||||||
// override the specific index if it already exists
|
// override the specific index if it already exists
|
||||||
IndexWriter writer = new IndexWriter(indexDir, ana, true);
|
IndexWriter writer = new IndexWriter(indexDir, ana, true);
|
||||||
writer.setUseCompoundFile(true);
|
writer.setUseCompoundFile(true); // why?
|
||||||
writer.mergeFactor *= 2;
|
// blindly up these parameters for speed
|
||||||
writer.minMergeDocs *= 2;
|
writer.setMergeFactor( writer.getMergeFactor() * 2);
|
||||||
|
writer.setMaxBufferedDocs( writer.getMaxBufferedDocs() * 2);
|
||||||
Iterator i1 = word2Nums.keySet().iterator();
|
Iterator i1 = word2Nums.keySet().iterator();
|
||||||
while (i1.hasNext()) // for each word
|
while (i1.hasNext()) // for each word
|
||||||
{
|
{
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>WordNet Lucene Synonyms Integration</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
This package uses synonyms defined by <a href="http://www.cogsci.princeton.edu/~wn/">WordNet</a> to build a
|
||||||
|
Lucene index storing them, which in turn can be used for query expansion.
|
||||||
|
|
||||||
|
You normally run {@see org.apache.lucene.wordnet.Syns2Index} once to build the query index/"database", and then call
|
||||||
|
{@see org.apache.lucene.wordnet.SynExpand#expand SynExpand.expand(...)} to expand a query.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
|
||||||
|
<h3> Instructions </h3>
|
||||||
|
<ol>
|
||||||
|
<li> Download the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog database</a> , gunzip, untar etc.
|
||||||
|
<li> Invoke Syn2Index as appropriate to build a synonym index.
|
||||||
|
It'll take 2 arguments, the path to wn_s.pl from that WordNet downlaod, and the index name.
|
||||||
|
|
||||||
|
<li> Update your UI so that as appropriate you call SynExpand.expand(...) to expand user queries with synonyms.
|
||||||
|
</ol>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
Loading…
Reference in New Issue