make sure code works with WordNet2.0 (no problem) and add Query expansion, and comments

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151031 13f79535-47bb-0310-9956-ffa450edef68
2025-03-05 15:59:25 +00:00 · 2005-01-11 20:58:11 +00:00 · 2005-01-11 20:58:11 +00:00 · 9eca2a3160
commit 9eca2a3160
parent 9e7e1d417f
5 changed files with 284 additions and 40 deletions
--- a/sandbox/contributions/WordNet/build.xml
+++ b/sandbox/contributions/WordNet/build.xml
@ -29,6 +29,7 @@
    </java>
  </target>

+
  <target name="synonym" description="Find synonyms for word">
    <fail unless="synindex.exists">
      Index does not exist.
@ -49,4 +50,24 @@
    </java>
  </target>

+  <target name="expand" description="Perform synonym expansion on a query">
+    <fail unless="synindex.exists">
+      Index does not exist.
+    </fail>
+
+    <fail unless="query">
+      Must specify 'query' property.
+    </fail>
+    
+    <java classname="org.apache.lucene.wordnet.SynExpand">
+      <classpath>
+        <path refid="compile.classpath"/>
+        <pathelement location="${build.classes.dir}"/>
+      </classpath>
+
+      <arg file="${synindex.dir}"/>
+      <arg value="${query}"/>
+    </java>
+  </target>
+
 </project>
--- a/sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynExpand.java
+++ b/sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynExpand.java
@ -0,0 +1,127 @@
+package org.apache.lucene.wordnet;
+
+import org.apache.lucene.store.*;
+import org.apache.lucene.search.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.document.*;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.standard.*;
+import java.io.*;
+import java.util.*;
+
+
+/**
+ * Expand a query by looking up synonyms for every term.
+ * You need to invoke {@see Syns2Index} first to build the synonym index.
+ *
+ * @see Syns2Index
+ */
+public final class SynExpand {
+
+	/**
+	 * Test driver for synonym expansion.
+	 * Uses boost factor of 0.9 for illustrative purposes.
+	 *
+	 * If you pass in the query "big dog" then it prints out:
+	 *
+	 * <code><pre>
+	 * Query: big adult^0.9 bad^0.9 bighearted^0.9 boastful^0.9 boastfully^0.9 bounteous^0.9 bountiful^0.9 braggy^0.9 crowing^0.9 freehanded^0.9 giving^0.9 grown^0.9 grownup^0.9 handsome^0.9 large^0.9 liberal^0.9 magnanimous^0.9 momentous^0.9 openhanded^0.9 prominent^0.9 swelled^0.9 vainglorious^0.9 vauntingly^0.9
+	 * dog andiron^0.9 blackguard^0.9 bounder^0.9 cad^0.9 chase^0.9 click^0.9 detent^0.9 dogtooth^0.9 firedog^0.9 frank^0.9 frankfurter^0.9 frump^0.9 heel^0.9 hotdog^0.9 hound^0.9 pawl^0.9 tag^0.9 tail^0.9 track^0.9 trail^0.9 weenie^0.9 wiener^0.9 wienerwurst^0.9
+	 * </pre></code>
+	 */
+	public static void main(String[] args) throws IOException
+	{
+		if (args.length != 2)
+		{
+			System.out.println(
+							   "java org.apache.lucene.wordnet.SynExpand <index path> <query>");
+		}
+
+		FSDirectory directory = FSDirectory.getDirectory(args[0], false);
+		IndexSearcher searcher = new IndexSearcher(directory);
+
+		String query = args[1];
+		String field = "contents";
+
+		Query q = expand( query, searcher, new StandardAnalyzer(), field, 0.9f);
+		System.out.println( "Query: " + q.toString( field));
+
+
+
+		searcher.close();
+		directory.close();
+	}
+
+
+	/**
+	 * Perform synonym expansion on a query.
+	 *
+	 * @param query users query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser.
+	 *
+	 * @param syns a opened to the Lucene index you previously created with {@see Syns2Index}. The searcher is not closed or otherwise altered.
+	 *
+	 * @param a optional analyzer used to parse the users query else {@see StandardAnalzyer} is used
+	 *
+	 * @param field optional field name to search in or null if you want the default of "contents"
+	 *
+	 * @param boost optional boost applied to synonyms else no boost is applied
+	 *
+	 * @return the expanded Query
+	 */ 
+	public static Query expand( String query,
+								Searcher syns,
+								Analyzer a,
+								String field,
+								float boost)
+		throws IOException
+	{
+		Set already = new HashSet(); // avoid dups 
+		List top = new LinkedList(); // needs to be separately listed..
+		if ( field == null) field = "contents";
+		if ( a == null) a = new StandardAnalyzer();
+
+		// [1] Parse query into separate words so that when we expand we can avoid dups
+		TokenStream ts = a.tokenStream( field, new StringReader( query));
+		org.apache.lucene.analysis.Token t;
+		while ( (t = ts.next()) != null)
+		{
+			String word = t.termText();
+			if ( already.add( word))
+				top.add( word);
+		}
+		BooleanQuery tmp = new BooleanQuery();
+		
+		// [2] form query
+		Iterator it = top.iterator();
+		while ( it.hasNext())
+		{
+			// [2a] add to level words in
+			String word = (String) it.next();
+			TermQuery tq = new TermQuery( new Term( field, word));
+			tmp.add( tq, BooleanClause.Occur.SHOULD);
+
+			// [2b] add in unique synonums
+			Hits hits = syns.search( new TermQuery( new Term(Syns2Index.F_WORD, word)));
+			for (int i = 0; i < hits.length(); i++)
+			{
+				Document doc = hits.doc(i);
+				String[] values = doc.getValues( Syns2Index.F_SYN);
+				for ( int j = 0; j < values.length; j++)
+				{
+					String syn = values[ j];
+					if ( already.add( syn)) // avoid dups of top level words and synonyms
+					{
+						tq = new TermQuery( new Term( field, syn));
+						if ( boost > 0) // else keep normal 1.0
+							tq.setBoost( boost);
+						tmp.add( tq, BooleanClause.Occur.SHOULD); 
+					}
+				}
+			}
+		}
+
+
+		return tmp;
+	}
+								
+}
--- a/sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynLookup.java
+++ b/sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynLookup.java
@ -1,45 +1,114 @@
 package org.apache.lucene.wordnet;

-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.Hits;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.document.Document;
-import java.io.IOException;
+import org.apache.lucene.store.*;
+import org.apache.lucene.search.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.document.*;
+import org.apache.lucene.analysis.*;
+import java.io.*;
+import java.util.*;

+
+/**
+ * Test program to look up synonyms.
+ */
 public class SynLookup {

-  public static void main(String[] args) throws IOException {
-    if (args.length != 2) {
-      System.out.println(
-    "java org.apache.lucene.wordnet.SynLookup <index path> <word>");
-    }
+	public static void main(String[] args) throws IOException {
+		if (args.length != 2) {
+			System.out.println(
+							   "java org.apache.lucene.wordnet.SynLookup <index path> <word>");
+		}

-    FSDirectory directory = FSDirectory.getDirectory(args[0], false);
-    IndexSearcher searcher = new IndexSearcher(directory);
+		FSDirectory directory = FSDirectory.getDirectory(args[0], false);
+		IndexSearcher searcher = new IndexSearcher(directory);

-    String word = args[1];
-    Hits hits = searcher.search(
-      new TermQuery(new Term("word", word)));
+		String word = args[1];
+		Hits hits = searcher.search(
+									new TermQuery(new Term(Syns2Index.F_WORD, word)));

-    if (hits.length() == 0) {
-      System.out.println("No synonyms found for " + word);
-    } else {
-      System.out.println("Synonyms found for \"" + word + "\":");
-    }
+		if (hits.length() == 0) {
+			System.out.println("No synonyms found for " + word);
+		} else {
+			System.out.println("Synonyms found for \"" + word + "\":");
+		}

-    for (int i = 0; i < hits.length(); i++) {
-      Document doc = hits.doc(i);
+		for (int i = 0; i < hits.length(); i++) {
+			Document doc = hits.doc(i);

-      String[] values = doc.getValues("syn");
+			String[] values = doc.getValues(Syns2Index.F_SYN);

-      for (int j = 0; j < values.length; j++) {
-        System.out.println(values[j]);
-      }
-    }
+			for (int j = 0; j < values.length; j++) {
+				System.out.println(values[j]);
+			}
+		}

-    searcher.close();
-    directory.close();
-  }
+		searcher.close();
+		directory.close();
+	}
+
+
+	/**
+	 * Perform synonym expansion on a query.
+	 *
+	 * @param query
+	 * @param syns
+	 * @param a
+	 * @param field
+	 * @param boost
+	 */ 
+	public static Query expand( String query,
+								Searcher syns,
+								Analyzer a,
+								String field,
+								float boost)
+		throws IOException
+	{
+		Set already = new HashSet(); // avoid dups		
+		List top = new LinkedList(); // needs to be separately listed..
+
+		// [1] Parse query into separate words so that when we expand we can avoid dups
+		TokenStream ts = a.tokenStream( field, new StringReader( query));
+		org.apache.lucene.analysis.Token t;
+		while ( (t = ts.next()) != null)
+		{
+			String word = t.termText();
+			if ( already.add( word))
+				top.add( word);
+		}
+		BooleanQuery tmp = new BooleanQuery();
+		
+		// [2] form query
+		Iterator it = top.iterator();
+		while ( it.hasNext())
+		{
+			// [2a] add to level words in
+			String word = (String) it.next();
+			TermQuery tq = new TermQuery( new Term( field, word));
+			tmp.add( tq, BooleanClause.Occur.SHOULD);
+
+			// [2b] add in unique synonums
+			Hits hits = syns.search( new TermQuery( new Term(Syns2Index.F_WORD, word)));
+			for (int i = 0; i < hits.length(); i++)
+			{
+				Document doc = hits.doc(i);
+				String[] values = doc.getValues( Syns2Index.F_SYN);
+				for ( int j = 0; j < values.length; j++)
+				{
+					String syn = values[ j];
+					if ( already.add( syn))
+					{
+						tq = new TermQuery( new Term( field, syn));
+						if ( boost > 0) // else keep normal 1.0
+							tq.setBoost( boost);
+						tmp.add( tq, BooleanClause.Occur.SHOULD); 
+					}
+				}
+			}
+		}
+
+
+		return tmp;
+	}
+								
 }
--- a/sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/Syns2Index.java
+++ b/sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/Syns2Index.java
@ -23,8 +23,10 @@ import java.util.TreeSet;
 import java.util.TreeMap;

 /**
- * Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/~wn/obtain.shtml">WordNet prolog download</a>
- * into a Lucene index suitable for looking up synonyms and performing query expansion.
+ * Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
+ * into a Lucene index suitable for looking up synonyms and performing query expansion ({@see SynExpand#expand SynExpand.expand(...)}).
+ *
+ * This has been tested with WordNet 2.0.
 *
 * The index has fields named "word" ({@see #F_WORD})
 * and "syn" ({@see #F_SYN}).
@ -40,8 +42,7 @@ import java.util.TreeMap;
 * related meanings we don't do that here.
 * </p>
 *
- * This can take 8 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
- * If you boost the minMergeDocuments and mergeFactor of the index writer than you can get this down to under 4 minutes.
+ * This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
 *
 * @author Dave Spencer, dave&#064;searchmorph.com
 * @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
@ -76,7 +77,7 @@ public class Syns2Index
    private static final Analyzer ana = new StandardAnalyzer();

    /**
-     * Takes optional arg of prolog file name.
+     * Takes arg of prolog file name and index directory.
     */
    public static void main(String[] args)
        throws Throwable
@ -228,9 +229,10 @@ public class Syns2Index

        // override the specific index if it already exists
        IndexWriter writer = new IndexWriter(indexDir, ana, true);
-        writer.setUseCompoundFile(true);
-		writer.mergeFactor *= 2;
-		writer.minMergeDocs *= 2;
+        writer.setUseCompoundFile(true); // why?
+		// blindly up these parameters for speed
+		writer.setMergeFactor( writer.getMergeFactor() * 2);
+		writer.setMaxBufferedDocs( writer.getMaxBufferedDocs() * 2);
        Iterator i1 = word2Nums.keySet().iterator();
        while (i1.hasNext()) // for each word
        {
--- a/sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/package.html
+++ b/sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/package.html
@ -0,0 +1,25 @@
+<html>
+    <head>
+<title>WordNet Lucene Synonyms Integration</title>
+</head>
+<body>
+
+    This package uses synonyms defined by <a href="http://www.cogsci.princeton.edu/~wn/">WordNet</a> to build a
+    Lucene index storing them, which in turn can be used for query expansion.
+
+    You normally run {@see org.apache.lucene.wordnet.Syns2Index} once to build the query index/"database", and then call
+    {@see org.apache.lucene.wordnet.SynExpand#expand SynExpand.expand(...)} to expand a query.
+
+    <p>
+
+	<h3> Instructions </h3>
+	<ol>
+	    <li> Download the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog database</a> , gunzip, untar etc.
+	<li> Invoke Syn2Index as appropriate to build a synonym index.
+	    It'll take 2 arguments, the path to wn_s.pl from that WordNet downlaod, and the index name.
+   
+	 <li> Update your UI so that as appropriate you call SynExpand.expand(...) to expand user queries with synonyms.
+       </ol>
+
+</body>
+    </html>