- Classes that provide support for queries such as "microsoft app*".

PR: Obtained from: Submitted by: Anders Nielsen Reviewed by: otis git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149820 13f79535-47bb-0310-9956-ffa450edef68
2002-07-18 14:39:58 +00:00 · 2002-07-18 14:39:58 +00:00 · 15bbd8def8
parent d06a16f630
commit 15bbd8def8
3 changed files with 700 additions and 0 deletions
--- a/src/java/org/apache/lucene/index/MultipleTermPositions.java
+++ b/src/java/org/apache/lucene/index/MultipleTermPositions.java
@ -0,0 +1,314 @@
+package org.apache.lucene.index;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.util.PriorityQueue;
+
+
+/**
+ * Describe class <code>MultipleTermPositions</code> here.
+ *
+ * @author Anders Nielsen
+ * @version 1.0
+ */
+public class MultipleTermPositions
+    implements TermPositions
+{
+    private final static class TermPositionsQueue
+	extends PriorityQueue
+    {
+	TermPositionsQueue(List termPositions)
+	    throws IOException
+	{
+	    initialize(termPositions.size());
+
+	    Iterator i = termPositions.iterator();
+	    while (i.hasNext())
+	    {
+		TermPositions tp = (TermPositions)i.next();
+		if (tp.next())
+		    put(tp);
+	    }
+	}
+
+	final TermPositions peek()
+	{
+	    return (TermPositions)top();
+	}
+
+	public final boolean lessThan(Object a, Object b)
+	{
+	    return ((TermPositions)a).doc() < ((TermPositions)b).doc();
+	}
+    }
+
+    private final static class IntQueue
+    {
+	private int _arraySize = 16;
+
+	private int _index = 0;
+	private int _lastIndex = 0;
+
+	private int[] _array = new int[_arraySize];
+
+	final void add(int i)
+	{
+	    if (_lastIndex == _arraySize)
+		growArray();
+
+	    _array[_lastIndex++] = i;
+	}
+
+	final int next()
+	{
+	    return _array[_index++];
+	}
+
+	final void sort()
+	{
+	    Arrays.sort(_array, _index, _lastIndex);
+	}
+
+	final void clear()
+	{
+	    _index = 0;
+	    _lastIndex = 0;
+	}
+
+	final int size()
+	{
+	    return (_lastIndex-_index);
+	}
+
+	private void growArray()
+	{
+	    int[] newArray = new int[_arraySize*2];
+	    System.arraycopy(_array, 0, newArray, 0, _arraySize);
+	    _array = newArray;
+	    _arraySize *= 2;
+	}
+    }
+
+    private int _doc;
+    private int _freq;
+
+    private TermPositionsQueue _termPositionsQueue;
+    private IntQueue _posList;
+
+    /**
+     * Creates a new <code>MultipleTermPositions</code> instance.
+     *
+     * @param indexReader an <code>IndexReader</code> value
+     * @param terms a <code>Term[]</code> value
+     * @exception IOException if an error occurs
+     */
+    public MultipleTermPositions(IndexReader indexReader, Term[] terms)
+	throws IOException
+    {
+	List termPositions = new LinkedList();
+
+	for (int i=0; i<terms.length; i++)
+	    termPositions.add(indexReader.termPositions(terms[i]));
+
+	_termPositionsQueue = new TermPositionsQueue(termPositions);
+	_posList = new IntQueue();
+    }
+
+    /**
+     * Describe <code>next</code> method here.
+     *
+     * @return a <code>boolean</code> value
+     * @exception IOException if an error occurs
+     * @see TermDocs#next()
+     */
+    public final boolean next()
+	throws IOException
+    {
+	if (_termPositionsQueue.size() == 0)
+	    return false;
+
+	_posList.clear();
+	_doc = _termPositionsQueue.peek().doc();
+
+	TermPositions tp;
+	do
+	{
+	    tp = _termPositionsQueue.peek();
+
+	    for (int i=0; i<tp.freq(); i++)
+		_posList.add(tp.nextPosition());
+
+	    if (tp.next())
+		_termPositionsQueue.adjustTop();
+	    else
+	    {
+		_termPositionsQueue.pop();
+		tp.close();
+	    }
+	}
+	while (_termPositionsQueue.size() > 0 && _termPositionsQueue.peek().doc() == _doc);
+
+	_posList.sort();
+	_freq = _posList.size();
+
+	return true;
+    }
+
+    /**
+     * Describe <code>nextPosition</code> method here.
+     *
+     * @return an <code>int</code> value
+     * @exception IOException if an error occurs
+     * @see TermPositions#nextPosition()
+     */
+    public final int nextPosition()
+	throws IOException
+    {
+	return _posList.next();
+    }
+
+    /**
+     * Describe <code>skipTo</code> method here.
+     *
+     * @param target an <code>int</code> value
+     * @return a <code>boolean</code> value
+     * @exception IOException if an error occurs
+     * @see TermDocs#skipTo(int)
+     */
+    public final boolean skipTo(int target)
+	throws IOException
+    {
+	while (target > _termPositionsQueue.peek().doc())
+	{
+	    TermPositions tp = (TermPositions)_termPositionsQueue.pop();
+
+	    if (tp.skipTo(target))
+		_termPositionsQueue.put(tp);
+	    else
+		tp.close();
+	}
+
+	return next();
+    }
+
+    /**
+     * Describe <code>doc</code> method here.
+     *
+     * @return an <code>int</code> value
+     * @see TermDocs#doc()
+     */
+    public final int doc()
+    {
+	return _doc;
+    }
+
+    /**
+     * Describe <code>freq</code> method here.
+     *
+     * @return an <code>int</code> value
+     * @see TermDocs#freq()
+     */
+    public final int freq()
+    {
+	return _freq;
+    }
+
+    /**
+     * Describe <code>close</code> method here.
+     *
+     * @exception IOException if an error occurs
+     * @see TermDocs#close()
+     */
+    public final void close()
+	throws IOException
+    {
+	while (_termPositionsQueue.size() > 0)
+	    ((TermPositions)_termPositionsQueue.pop()).close();
+    }
+
+    /**
+     * Describe <code>seek</code> method here.
+     *
+     * @param arg0 a <code>Term</code> value
+     * @exception IOException if an error occurs
+     * @see TermDocs#seek(Term)
+     */
+    public void seek(Term arg0)
+	throws IOException
+    {
+	throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Describe <code>read</code> method here.
+     *
+     * @param arg0 an <code>int[]</code> value
+     * @param arg1 an <code>int[]</code> value
+     * @return an <code>int</code> value
+     * @exception IOException if an error occurs
+     * @see TermDocs#read(int[], int[])
+     */
+    public int read(int[] arg0, int[] arg1)
+	throws IOException
+    {
+	throw new UnsupportedOperationException();
+    }
+}
--- a/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
+++ b/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
@ -0,0 +1,252 @@
+package org.apache.lucene.search;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultipleTermPositions;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.search.Query;
+
+/**
+ * PhrasePrefixQuery is a generalized version of PhraseQuery, with an added
+ * method {@link add(Term[])}.
+ * To use this class, to search for the phrase "Microsoft app*" first use
+ * add(Term) on the term "Microsoft", then find all terms that has "app" as
+ * prefix using IndexReader.terms(Term), and use PhrasePrefixQuery.add(Term[]
+ * terms) to add them to the query.
+ *
+ * @author Anders Nielsen
+ * @version 1.0
+ */
+public class PhrasePrefixQuery
+    extends Query
+{
+    private String _field;
+    private ArrayList _termArrays = new ArrayList();
+
+    private float _idf = 0.0f;
+    private float _weight = 0.0f;
+
+    private int _slop = 0;
+
+    /**
+     * Creates a new <code>PhrasePrefixQuery</code> instance.
+     *
+     */
+    public PhrasePrefixQuery()
+    {
+    }
+
+    /**
+     * Describe <code>setSlop</code> method here.
+     *
+     * @param s an <code>int</code> value
+     */
+    public void setSlop(int s)
+    {
+	_slop = s;
+    }
+
+    /**
+     * Describe <code>getSlop</code> method here.
+     *
+     * @return an <code>int</code> value
+     */
+    public int getSlop()
+    {
+	return _slop;
+    }
+
+    /**
+     * Describe <code>add</code> method here.
+     *
+     * @param term a <code>Term</code> value
+     */
+    public void add(Term term)
+    {
+	add(new Term[]{term});
+    }
+
+    /**
+     * Describe <code>add</code> method here.
+     *
+     * @param terms a <code>Term[]</code> value
+     */
+    public void add(Term[] terms)
+    {
+	if (_termArrays.size() == 0)
+	    _field = terms[0].field();
+
+      	for (int i=0; i<terms.length; i++)
+	{
+	    if (terms[i].field() != _field)
+	    {
+		throw new IllegalArgumentException(
+		    "All phrase terms must be in the same field (" + _field + "): "
+		    + terms[i]);
+	    }
+	}
+
+	_termArrays.add(terms);
+    }
+
+    Scorer scorer(IndexReader reader)
+	throws IOException
+    {
+    	if (_termArrays.size() == 0)  // optimize zero-term case
+	    return null;
+
+	if (_termArrays.size() == 1)  // optimize one-term case
+	{
+	    Term[] terms = (Term[])_termArrays.get(0);
+
+	    BooleanQuery boq = new BooleanQuery();
+	    for (int i=0; i<terms.length; i++)
+		boq.add(new TermQuery(terms[i]), false, false);
+
+	    return boq.scorer(reader);
+    	}
+
+    	TermPositions[] tps = new TermPositions[_termArrays.size()];
+	for (int i=0; i<tps.length; i++)
+	{
+	    Term[] terms = (Term[])_termArrays.get(i);
+
+	    TermPositions p;
+	    if (terms.length > 1)
+		p = new MultipleTermPositions(reader, terms);
+	    else
+		p = reader.termPositions(terms[0]);
+
+	    if (p == null)
+		return null;
+
+	    tps[i] = p;
+	}
+
+	if (_slop == 0)
+	    return new ExactPhraseScorer(tps, reader.norms(_field), _weight);
+	else
+	    return new SloppyPhraseScorer(tps, _slop, reader.norms(_field), _weight);
+    }
+
+    float sumOfSquaredWeights(Searcher searcher)
+	throws IOException
+    {
+	Iterator i = _termArrays.iterator();
+	while (i.hasNext())
+	{
+	    Term[] terms = (Term[])i.next();
+	    for (int j=0; j<terms.length; j++)
+		_idf += Similarity.idf(terms[j], searcher);
+	}
+
+	_weight = _idf * boost;
+	return _weight * _weight;
+    }
+
+    void normalize(float norm)
+    {
+	_weight *= norm;
+	_weight *= _idf;
+    }
+
+    /**
+     * Describe <code>toString</code> method here.
+     *
+     * This method assumes that the first term in a array of terms is the
+     * prefix for the whole array. That might not necessarily be so.
+     *
+     * @param f a <code>String</code> value
+     * @return a <code>String</code> value
+     */
+    public final String toString(String f)
+    {
+	StringBuffer buffer = new StringBuffer();
+	if (!_field.equals(f))
+	{
+	    buffer.append(_field);
+	    buffer.append(":");
+	}
+
+	buffer.append("\"");
+	Iterator i = _termArrays.iterator();
+	while (i.hasNext())
+	{
+	    Term[] terms = (Term[])i.next();
+	    buffer.append(terms[0].text() + (terms.length > 0 ? "*" : ""));
+	}
+	buffer.append("\"");
+
+	if (_slop != 0)
+	{
+	    buffer.append("~");
+	    buffer.append(_slop);
+	}
+
+	if (boost != 1.0f)
+	{
+	    buffer.append("^");
+	    buffer.append(Float.toString(boost));
+	}
+
+	return buffer.toString();
+    }
+}
--- a/src/test/org/apache/lucene/search/TestPhrasePrefixQuery.java
+++ b/src/test/org/apache/lucene/search/TestPhrasePrefixQuery.java
@ -0,0 +1,134 @@
+package org.apache.lucene.search;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import junit.framework.TestCase;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+/**
+ * This class tests PhrasePrefixQuery class.
+ *
+ * @author Otis Gospodnetic
+ * @version $Id$
+ */
+public class TestPhrasePrefixQuery
+    extends TestCase
+{
+    public TestPhrasePrefixQuery(String name)
+    {
+	super(name);
+    }
+
+    /**
+     *
+     */
+    public void testPhrasePrefix()
+        throws IOException
+    {
+        RAMDirectory indexStore = new RAMDirectory();
+        IndexWriter writer = new IndexWriter(indexStore, new SimpleAnalyzer(), true);
+        Document doc1 = new Document();
+        Document doc2 = new Document();
+        Document doc3 = new Document();
+	Document doc4 = new Document();
+	doc1.add(Field.Text("body", "blueberry pie"));
+        doc2.add(Field.Text("body", "blueberry pizza"));
+        doc3.add(Field.Text("body", "blueberry chewing gum"));
+        doc4.add(Field.Text("body", "picadelly circus"));
+        writer.addDocument(doc1);
+        writer.addDocument(doc2);
+        writer.addDocument(doc3);
+        writer.addDocument(doc4);
+	writer.optimize();
+	writer.close();
+
+	IndexSearcher searcher = new IndexSearcher(indexStore);
+
+	PhrasePrefixQuery query1 = new PhrasePrefixQuery();
+	PhrasePrefixQuery query2 = new PhrasePrefixQuery();
+	query1.add(new Term("body", "blueberry"));
+	query2.add(new Term("body", "strawberry"));
+
+	LinkedList termsWithPrefix = new LinkedList();
+        IndexReader ir = IndexReader.open(indexStore);
+
+	// this TermEnum gives "picadelly", "pie" and "pizza".
+        TermEnum te = ir.terms(new Term("body", "pi*"));
+        do {
+            termsWithPrefix.add(te.term());
+        } while (te.next());
+	query1.add((Term[])termsWithPrefix.toArray(new Term[0]));
+	query2.add((Term[])termsWithPrefix.toArray(new Term[0]));
+
+	Hits result;
+	result = searcher.search(query1);
+	assertEquals(2, result.length());
+
+	result = searcher.search(query2);
+	assertEquals(0, result.length());
+    }
+}