From 4b3032e9e6c18388d257af9ddc4c1e9623127701 Mon Sep 17 00:00:00 2001 From: Koji Sekiguchi Date: Wed, 14 Sep 2011 13:01:40 +0000 Subject: [PATCH] LUCENE-3426: add NGramPhraseQuery git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1170586 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 8 ++ .../lucene/search/NGramPhraseQuery.java | 98 +++++++++++++++++++ .../lucene/search/TestNGramPhraseQuery.java | 92 +++++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 lucene/src/java/org/apache/lucene/search/NGramPhraseQuery.java create mode 100644 lucene/src/test/org/apache/lucene/search/TestNGramPhraseQuery.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f7b343c2643..cfc0401cd3e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -589,6 +589,14 @@ Bug fixes should ignore the maxMergedSegmentMB setting (v.sevel via Mike McCandless) +New Features + +Optimizations + +* LUCENE-3426: Add NGramPhraseQuery which extends PhraseQuery and tries to reduce + the number of terms of the query when rewrite(), in order to improve performance. + (Robert Muir, Koji Sekiguchi) + ======================= Lucene 3.4.0 ======================= Bug fixes diff --git a/lucene/src/java/org/apache/lucene/search/NGramPhraseQuery.java b/lucene/src/java/org/apache/lucene/search/NGramPhraseQuery.java new file mode 100644 index 00000000000..cec4e16fb2b --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/NGramPhraseQuery.java @@ -0,0 +1,98 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +/** + * This is a {@link PhraseQuery} which is optimized for n-gram phrase query. + * For example, when you query "ABCD" on a 2-gram field, you may want to use + * NGramPhraseQuery rather than {@link PhraseQuery}, because NGramPhraseQuery + * will {@link #rewrite(IndexReader)} the query to "AB/0 CD/2", while {@link PhraseQuery} + * will query "AB/0 BC/1 CD/2" (where term/position). + * + */ +public class NGramPhraseQuery extends PhraseQuery { + private final int n; + + /** + * Constructor that takes gram size. + * @param n + */ + public NGramPhraseQuery(int n){ + super(); + this.n = n; + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + if(getSlop() != 0) return super.rewrite(reader); + + // check whether optimizable or not + if(n < 2 || // non-overlap n-gram cannot be optimized + getTerms().length < 3) // too short to optimize + return super.rewrite(reader); + + // check all posIncrement is 1 + // if not, cannot optimize + int[] positions = getPositions(); + Term[] terms = getTerms(); + int prevPosition = positions[0]; + for(int i = 1; i < positions.length; i++){ + int pos = positions[i]; + if(prevPosition + 1 != pos) return super.rewrite(reader); + prevPosition = pos; + } + + // now create the new optimized phrase query for n-gram + PhraseQuery optimized = new PhraseQuery(); + int pos = 0; + final int lastPos = terms.length - 1; + for(int i = 0; i < terms.length; i++){ + if(pos % n == 0 || pos >= lastPos){ + optimized.add(terms[i], positions[i]); + } + pos++; + } + + return optimized; + } + + /** Returns true iff o is equal to this. */ + @Override + public boolean equals(Object o) { + if (!(o instanceof NGramPhraseQuery)) + return false; + NGramPhraseQuery other = (NGramPhraseQuery)o; + if(this.n != other.n) return false; + return super.equals(other); + } + + /** Returns a hash code value for this object.*/ + @Override + public int hashCode() { + return Float.floatToIntBits(getBoost()) + ^ getSlop() + ^ getTerms().hashCode() + ^ getPositions().hashCode() + ^ n; + } +} diff --git a/lucene/src/test/org/apache/lucene/search/TestNGramPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestNGramPhraseQuery.java new file mode 100644 index 00000000000..7a28963e9bf --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestNGramPhraseQuery.java @@ -0,0 +1,92 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +public class TestNGramPhraseQuery extends LuceneTestCase { + + private static IndexReader reader; + private static Directory directory; + + @BeforeClass + public static void beforeClass() throws Exception { + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random, directory); + writer.close(); + reader = IndexReader.open(directory); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + reader = null; + directory.close(); + directory = null; + } + + public void testRewrite() throws Exception { + // bi-gram test ABC => AB/BC => AB/BC + PhraseQuery pq1 = new NGramPhraseQuery(2); + pq1.add(new Term("f", "AB")); + pq1.add(new Term("f", "BC")); + + Query q = pq1.rewrite(reader); + assertTrue(q instanceof NGramPhraseQuery); + assertSame(pq1, q); + pq1 = (NGramPhraseQuery)q; + assertArrayEquals(new Term[]{new Term("f", "AB"), new Term("f", "BC")}, pq1.getTerms()); + assertArrayEquals(new int[]{0, 1}, pq1.getPositions()); + + // bi-gram test ABCD => AB/BC/CD => AB//CD + PhraseQuery pq2 = new NGramPhraseQuery(2); + pq2.add(new Term("f", "AB")); + pq2.add(new Term("f", "BC")); + pq2.add(new Term("f", "CD")); + + q = pq2.rewrite(reader); + assertTrue(q instanceof PhraseQuery); + assertNotSame(pq2, q); + pq2 = (PhraseQuery)q; + assertArrayEquals(new Term[]{new Term("f", "AB"), new Term("f", "CD")}, pq2.getTerms()); + assertArrayEquals(new int[]{0, 2}, pq2.getPositions()); + + // tri-gram test ABCDEFGH => ABC/BCD/CDE/DEF/EFG/FGH => ABC///DEF//FGH + PhraseQuery pq3 = new NGramPhraseQuery(3); + pq3.add(new Term("f", "ABC")); + pq3.add(new Term("f", "BCD")); + pq3.add(new Term("f", "CDE")); + pq3.add(new Term("f", "DEF")); + pq3.add(new Term("f", "EFG")); + pq3.add(new Term("f", "FGH")); + + q = pq3.rewrite(reader); + assertTrue(q instanceof PhraseQuery); + assertNotSame(pq3, q); + pq3 = (PhraseQuery)q; + assertArrayEquals(new Term[]{new Term("f", "ABC"), new Term("f", "DEF"), new Term("f", "FGH")}, pq3.getTerms()); + assertArrayEquals(new int[]{0, 3, 5}, pq3.getPositions()); + } + +}