From 3743037bc1d1b7e5822e203b240ef4def9bd651d Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Tue, 22 Sep 2009 21:55:57 +0000 Subject: [PATCH] SOLR-908: add CommonGramsFilterFactory CommonGramsQueryFilterFactory git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@817859 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 + .../solr/analysis/CommonGramsFilter.java | 223 +++++++++++++++ .../analysis/CommonGramsFilterFactory.java | 89 ++++++ .../solr/analysis/CommonGramsQueryFilter.java | 138 +++++++++ .../CommonGramsQueryFilterFactory.java | 98 +++++++ .../CommonGramsFilterFactoryTest.java | 69 +++++ .../solr/analysis/CommonGramsFilterTest.java | 265 ++++++++++++++++++ .../CommonGramsQueryFilterFactoryTest.java | 68 +++++ 8 files changed, 955 insertions(+) create mode 100644 src/java/org/apache/solr/analysis/CommonGramsFilter.java create mode 100644 src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java create mode 100644 src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java create mode 100644 src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java create mode 100644 src/test/org/apache/solr/analysis/CommonGramsFilterTest.java create mode 100644 src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java diff --git a/CHANGES.txt b/CHANGES.txt index cf96c0244cd..846c40f668b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -321,6 +321,11 @@ New Features 80. SOLR-1447 : Simple property injection. & syntaxes are deprecated ( Jason rutherglen noble) +82. SOLR-908 : CommonGramsFilterFactory/CommonGramsQueryFilterFactory for + speeding up phrase queries containing common words by indexing + n-grams and using them at query time. + (Tom Burton-West, Jason Rutherglen via yonik) + Optimizations ---------------------- 1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the diff --git a/src/java/org/apache/solr/analysis/CommonGramsFilter.java b/src/java/org/apache/solr/analysis/CommonGramsFilter.java new file mode 100644 index 00000000000..b31ebc206e4 --- /dev/null +++ b/src/java/org/apache/solr/analysis/CommonGramsFilter.java @@ -0,0 +1,223 @@ +/* + * Licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. + */ + +package org.apache.solr.analysis; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Set; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +/* + * TODO: Rewrite to use new TokenStream api from lucene 2.9 when BufferedTokenStream uses it. + * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and + * associated constructors + */ + +/** + * Construct bigrams for frequently occurring terms while indexing. Single terms + * are still indexed too, with bigrams overlaid. This is achieved through the + * use of {@link Token#SetPositionIncrement(int)}. Bigrams have a type + * of "gram" Example + *
    + *
  • input:"the quick brown fox"
  • + *
  • output:|"the","the-quick"|"brown"|"fox"|
  • + *
  • "the-quick" has a position increment of 0 so it is in the same position + * as "the" "the-quick" has a term.type() of "gram"
  • + * + *
+ */ + +/* + * Constructors and makeCommonSet based on similar code in StopFilter + */ + +public class CommonGramsFilter extends BufferedTokenStream { + + private static final char SEPARATOR = '_'; + + private final CharArraySet commonWords; + + private StringBuilder buffer = new StringBuilder(); + + /** + * Construct a token stream filtering the given input using a Set of common + * words to create bigrams. Outputs both unigrams with position increment and + * bigrams with position increment 0 type=gram where one or both of the words + * in a potential bigram are in the set of common words . + * + * @param input TokenStream input in filter chain + * @param commonWords The set of common words. + * + */ + public CommonGramsFilter(TokenStream input, Set commonWords) { + this(input, commonWords, false); + } + + /** + * Construct a token stream filtering the given input using a Set of common + * words to create bigrams, case-sensitive if ignoreCase is false (unless Set + * is CharArraySet). If commonWords is an instance of + * {@link CharArraySet} (true if makeCommonSet() was used to + * construct the set) it will be directly used and ignoreCase + * will be ignored since CharArraySet directly controls case + * sensitivity. + *

+ * If commonWords is not an instance of {@link CharArraySet}, a + * new CharArraySet will be constructed and ignoreCase will be + * used to specify the case sensitivity of that set. + * + * @param input TokenStream input in filter chain. + * @param commonWords The set of common words. + * @param ignoreCase -Ignore case when constructing bigrams for common words. + */ + public CommonGramsFilter(TokenStream input, Set commonWords, + boolean ignoreCase) { + super(input); + if (commonWords instanceof CharArraySet) { + this.commonWords = (CharArraySet) commonWords; + } else { + this.commonWords = new CharArraySet(commonWords.size(), ignoreCase); + this.commonWords.addAll(commonWords); + } + init(); + } + + /** + * Construct a token stream filtering the given input using an Array of common + * words to create bigrams. + * + * @param input Tokenstream in filter chain + * @param commonWords words to be used in constructing bigrams + */ + public CommonGramsFilter(TokenStream input, String[] commonWords) { + this(input, commonWords, false); + init(); + } + + /** + * Construct a token stream filtering the given input using an Array of common + * words to create bigrams and is case-sensitive if ignoreCase is false. + * + * @param input Tokenstream in filter chain + * @param commonWords words to be used in constructing bigrams + * @param ignoreCase -Ignore case when constructing bigrams for common words. + */ + public CommonGramsFilter(TokenStream input, String[] commonWords, + boolean ignoreCase) { + super(input); + this.commonWords = (CharArraySet) makeCommonSet(commonWords, ignoreCase); + init(); + } + + // Here for future moving to 2.9 api See StopFilter code + + public void init() { + /** + * termAtt = (TermAttribute) addAttribute(TermAttribute.class); posIncrAtt + * =(PositionIncrementAttribute) + * addAttribute(PositionIncrementAttribute.class); typeAdd =(TypeAttribute) + * addAttribute(TypeAttribute.class); + */ + } + + /** + * Build a CharArraySet from an array of common words, appropriate for passing + * into the CommonGramsFilter constructor. This permits this commonWords + * construction to be cached once when an Analyzer is constructed. + * + * @see #makeCommonSet(java.lang.String[], boolean) passing false to + * ignoreCase + */ + public static final CharArraySet makeCommonSet(String[] commonWords) { + return makeCommonSet(commonWords, false); + } + + /** + * Build a CharArraySet from an array of common words, appropriate for passing + * into the CommonGramsFilter constructor,case-sensitive if ignoreCase is + * false. + * + * @param commonWords + * @param ignoreCase If true, all words are lower cased first. + * @return a Set containing the words + */ + public static final CharArraySet makeCommonSet(String[] commonWords, + boolean ignoreCase) { + CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase); + commonSet.addAll(Arrays.asList(commonWords)); + return commonSet; + } + + /** + * Inserts bigrams for common words into a token stream. For each input token, + * output the token. If the token and/or the following token are in the list + * of common words also output a bigram with position increment 0 and + * type="gram" + */ + /* + * TODO: implement new lucene 2.9 API incrementToken() instead of deprecated + * Token.next() TODO:Consider adding an option to not emit unigram stopwords + * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be + * changed to work with this. TODO: Consider optimizing for the case of three + * commongrams i.e "man of the year" normally produces 3 bigrams: "man-of", + * "of-the", "the-year" but with proper management of positions we could + * eliminate the middle bigram "of-the"and save a disk seek and a whole set of + * position lookups. + */ + public Token process(Token token) throws IOException { + Token next = peek(1); + // if this is the last token just spit it out. Any commongram would have + // been output in the previous call + if (next == null) { + return token; + } + + /** + * if this token or next are common then construct a bigram with type="gram" + * position increment = 0, and put it in the output queue. It will be + * returned when super.next() is called, before this method gets called with + * a new token from the input stream See implementation of next() in + * BufferedTokenStream + */ + + if (isCommon(token) || isCommon(next)) { + Token gram = gramToken(token, next); + write(gram); + } + // we always return the unigram token + return token; + } + + /** True if token is for a common term. */ + private boolean isCommon(Token token) { + return commonWords != null + && commonWords.contains(token.termBuffer(), 0, token.termLength()); + } + + /** Construct a compound token. */ + private Token gramToken(Token first, Token second) { + buffer.setLength(0); + buffer.append(first.termText()); + buffer.append(SEPARATOR); + buffer.append(second.termText()); + Token result = new Token(buffer.toString(), first.startOffset(), second + .endOffset(), "gram"); + result.setPositionIncrement(0); + return result; + } + + public void reset() throws IOException { + super.reset(); + buffer.setLength(0); + } +} diff --git a/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java b/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java new file mode 100644 index 00000000000..f7820b6c140 --- /dev/null +++ b/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java @@ -0,0 +1,89 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.analysis; + +import java.io.IOException; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopAnalyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.util.plugin.ResourceLoaderAware; + +/** + * Constructs a CommonGramsFilter + */ + +/* + * This is pretty close to a straight copy from StopFilterFactory + */ +public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements + ResourceLoaderAware { + + public void inform(ResourceLoader loader) { + String commonWordFiles = args.get("words"); + ignoreCase = getBoolean("ignoreCase", false); + enablePositionIncrements = getBoolean("enablePositionIncrements", false); + + if (commonWordFiles != null) { + try { + List files = StrUtils.splitFileNames(commonWordFiles); + if (commonWords == null && files.size() > 0){ + //default stopwords list has 35 or so words, but maybe don't make it that big to start + commonWords = new CharArraySet(files.size() * 10, ignoreCase); + } + for (String file : files) { + List wlist = loader.getLines(file.trim()); + //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call + commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase)); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } else { + commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase); + } + } + + //Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095 + private CharArraySet commonWords; + private boolean ignoreCase; + private boolean enablePositionIncrements; + + public boolean isEnablePositionIncrements() { + return enablePositionIncrements; + } + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public Set getCommonWords() { + return commonWords; + } + + public CommonGramsFilter create(TokenStream input) { + CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, ignoreCase); + return commonGrams; + } +} + + + \ No newline at end of file diff --git a/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java b/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java new file mode 100644 index 00000000000..65c3c9b5238 --- /dev/null +++ b/src/java/org/apache/solr/analysis/CommonGramsQueryFilter.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.analysis; + +import java.io.IOException; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.Token; + +/** + * Wrap a CommonGramsFilter optimizing phrase queries by only returning single + * words when they are not a member of a bigram. + * + * Example: + *

    + *
  • query input to CommonGramsFilter: "the rain in spain falls mainly" + *
  • output of CommomGramsFilter/input to CommonGramsQueryFilter: + * |"the, "the-rain"|"rain" "rain-in"|"in, "in-spain"|"spain"|"falls"|"mainly" + *
  • output of CommonGramsQueryFilter:"the-rain", "rain-in" ,"in-spain", + * "falls", "mainly" + *
+ */ + +/* + * TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use the + * 2.9 lucene TokenStream api, make necessary changes here. + * See:http://hudson.zones + * .apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache + * /lucene/analysis/TokenStream.html and + * http://svn.apache.org/viewvc/lucene/java + * /trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798 + */ +public class CommonGramsQueryFilter extends BufferedTokenStream { + //private CharArraySet commonWords; + private Token prev; + + /** + * Constructor + * + * @param input must be a CommonGramsFilter! + * + */ + + public CommonGramsQueryFilter(CommonGramsFilter input) { + super(input); + prev = new Token(); + } + + public void reset() throws IOException { + super.reset(); + prev = new Token(); + } + + /** + * Output bigrams whenever possible to optimize queries. Only output unigrams + * when they are not a member of a bigram. Example: + *
    + *
  • input: "the rain in spain falls mainly" + *
  • output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly" + */ + + public Token process(Token token) throws IOException { + Token next = peek(1); + /* + * Deal with last token (next=null when current token is the last word) Last + * token will be a unigram. If previous token was a bigram, then we already + * output the last token as part of the unigram and should not additionally + * output the unigram.

    Example: If the end of the input to the + * CommonGramsFilter is "...the plain"

    • current token = "plain"
    • + *
    • next token = null
    • previous token = "the-plain" (bigram)
    • + *
    • Since the word "plain" was already output as part of the bigram we + * don't output it.
    Example: If the end of the input to the + * CommonGramsFilter is "falls mainly"
    • current token = + * "mainly"
    • next token = null
    • previous token = "falls" + * (unigram)
    • Since we haven't yet output the current token, we + * output it
    + */ + + // Deal with special case of last token + if (next == null) { + if (prev == null) { + // This is the first and only token i.e. one word query + return token; + } + if (prev != null && prev.type() != "gram") { + // If previous token was a unigram, output the current token + return token; + } else { + // If previous token was a bigram, we already output it and this token + // was output as part of the bigram so we are done. + return null; + } + } + + /* + * Possible cases are: |token |next 1|word |gram 2|word |word The + * CommonGramsFilter we are wrapping always outputs the unigram word prior + * to outputting an optional bigram: "the sound of" gets output as |"the", + * "the_sound"|"sound", "sound_of" For case 1 we consume the gram from the + * input stream and output it rather than the current token This means that + * the call to super.next() which reads a token from input and passes it on + * to this process method will always get a token of type word + */ + if (next != null && next.type() == "gram") { + // consume "next" token from list and output it + token = read(); + // use this to clone the token because clone requires all these args but + // won't take the token.type + // see + // http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/Token.html + prev.reinit(token.termBuffer(), 0, token.termLength(), token + .startOffset(), token.endOffset(), token.type()); + token.setPositionIncrement(1); + return token; + } + + // if the next token is not a bigram, then output the token + // see note above regarding this method of copying token to prev + prev.reinit(token.termBuffer(), 0, token.termLength(), token.startOffset(), + token.endOffset(), token.type()); + assert token.type() == "word"; + return token; + } +} diff --git a/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java b/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java new file mode 100644 index 00000000000..fbf053f8279 --- /dev/null +++ b/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.analysis; + +import java.io.IOException; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopAnalyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.util.plugin.ResourceLoaderAware; + +/** + * Construct CommonGramsQueryFilter + * + * This is pretty close to a straight copy from StopFilterFactory + * + */ +public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory + implements ResourceLoaderAware { + + public void inform(ResourceLoader loader) { + String commonWordFiles = args.get("words"); + ignoreCase = getBoolean("ignoreCase", false); + enablePositionIncrements = getBoolean("enablePositionIncrements", false); + + if (commonWordFiles != null) { + try { + List files = StrUtils.splitFileNames(commonWordFiles); + if (commonWords == null && files.size() > 0) { + // default stopwords list has 35 or so words, but maybe don't make it + // that big to start + commonWords = new CharArraySet(files.size() * 10, ignoreCase); + } + for (String file : files) { + List wlist = loader.getLines(file.trim()); + // TODO: once StopFilter.makeStopSet(List) method is available, switch + // to using that so we can avoid a toArray() call + commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist + .toArray(new String[0]), ignoreCase)); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } else { + commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet( + StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase); + } + } + + // Force the use of a char array set, as it is the most performant, although + // this may break things if Lucene ever goes away from it. See SOLR-1095 + private CharArraySet commonWords; + + private boolean ignoreCase; + + private boolean enablePositionIncrements; + + public boolean isEnablePositionIncrements() { + return enablePositionIncrements; + } + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public Set getCommonWords() { + return commonWords; + } + + /** + * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter + */ + public CommonGramsQueryFilter create(TokenStream input) { + CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, + ignoreCase); + CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter( + commonGrams); + return commonGramsQuery; + } +} diff --git a/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java b/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java new file mode 100644 index 00000000000..1eb46174d0e --- /dev/null +++ b/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java @@ -0,0 +1,69 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.solr.util.AbstractSolrTestCase; +import org.apache.solr.common.ResourceLoader; + +import java.util.Set; +import java.util.Map; +import java.util.HashMap; + +/** + * Tests pretty much copied from StopFilterFactoryTest We use the test files + * used by the StopFilterFactoryTest TODO: consider creating separate test files + * so this won't break if stop filter test files change + **/ +public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase { + public String getSchemaFile() { + return "schema-stop-keep.xml"; + } + + public String getSolrConfigFile() { + return "solrconfig.xml"; + } + + public void testInform() throws Exception { + ResourceLoader loader = solrConfig.getResourceLoader(); + assertTrue("loader is null and it shouldn't be", loader != null); + CommonGramsFilterFactory factory = new CommonGramsFilterFactory(); + Map args = new HashMap(); + args.put("words", "stop-1.txt"); + args.put("ignoreCase", "true"); + factory.init(args); + factory.inform(loader); + Set words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 2, + words.size() == 2); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory + .isIgnoreCase() == true); + + factory = new CommonGramsFilterFactory(); + args.put("words", "stop-1.txt, stop-2.txt"); + factory.init(args); + factory.inform(loader); + words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 4, + words.size() == 4); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory + .isIgnoreCase() == true); + + } +} diff --git a/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java b/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java new file mode 100644 index 00000000000..1ab2ba7b6f1 --- /dev/null +++ b/src/test/org/apache/solr/analysis/CommonGramsFilterTest.java @@ -0,0 +1,265 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.analysis; + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.StringTokenizer; +import java.util.Map.Entry; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.solr.analysis.TestBufferedTokenStream.AB_AAB_Stream; + +/** + * Tests CommonGramsQueryFilter + */ +public class CommonGramsFilterTest extends TestCase { + private static final String[] commonWords = { "s", "a", "b", "c", "d", "the", + "of" }; + + public void testReset() throws Exception { + final String input = "How the s a brown s cow d like A B thing?"; + WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); + + TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class); + assertTrue(cgf.incrementToken()); + assertEquals("How", term.term()); + assertTrue(cgf.incrementToken()); + assertEquals("How_the", term.term()); + assertTrue(cgf.incrementToken()); + assertEquals("the", term.term()); + assertTrue(cgf.incrementToken()); + assertEquals("the_s", term.term()); + + wt.reset(new StringReader(input)); + cgf.reset(); + assertTrue(cgf.incrementToken()); + assertEquals("How", term.term()); + } + + public void testCommonGramsQueryFilter() throws Exception { + Set> input2expectedSet = initQueryMap().entrySet(); + for (Iterator> i = input2expectedSet.iterator(); i + .hasNext();) { + Map.Entry me = i.next(); + String input = me.getKey(); + String expected = me.getValue(); + String message = "message: input value is: " + input; + assertEquals(message, expected, testFilter(input, "query")); + } + } + + public void testQueryReset() throws Exception { + final String input = "How the s a brown s cow d like A B thing?"; + WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input)); + CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); + CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf); + + TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class); + assertTrue(nsf.incrementToken()); + assertEquals("How_the", term.term()); + assertTrue(nsf.incrementToken()); + assertEquals("the_s", term.term()); + + wt.reset(new StringReader(input)); + nsf.reset(); + assertTrue(nsf.incrementToken()); + assertEquals("How_the", term.term()); + } + + public void testCommonGramsFilter() throws Exception { + Set> input2expectedSet = initMap().entrySet(); + for (Iterator> i = input2expectedSet.iterator(); i + .hasNext();) { + Map.Entry me = i.next(); + String input = me.getKey(); + String expected = me.getValue(); + String message = "message: input value is: " + input; + assertEquals(message, expected, testFilter(input, "common")); + } + } + + /** + * This is for testing CommonGramsQueryFilter which outputs a set of tokens + * optimized for querying with only one token at each position, either a + * unigram or a bigram It also will not return a token for the final position + * if the final word is already in the preceding bigram Example:(three + * tokens/positions in) + * "foo bar the"=>"foo:1|bar:2,bar-the:2|the:3=> "foo" "bar-the" (2 tokens + * out) + * + * @return Map + */ + private static Map initQueryMap() { + Map input2expected = new LinkedHashMap(); + + // Stop words used below are "of" "the" and "s" + + // two word queries + input2expected.put("brown fox", "/brown/fox"); + input2expected.put("the fox", "/the_fox"); + input2expected.put("fox of", "/fox_of"); + input2expected.put("of the", "/of_the"); + + // one word queries + input2expected.put("the", "/the"); + input2expected.put("foo", "/foo"); + + // 3 word combinations s=stopword/common word n=not a stop word + input2expected.put("n n n", "/n/n/n"); + input2expected.put("quick brown fox", "/quick/brown/fox"); + + input2expected.put("n n s", "/n/n_s"); + input2expected.put("quick brown the", "/quick/brown_the"); + + input2expected.put("n s n", "/n_s/s_n"); + input2expected.put("quick the brown", "/quick_the/the_brown"); + + input2expected.put("n s s", "/n_s/s_s"); + input2expected.put("fox of the", "/fox_of/of_the"); + + input2expected.put("s n n", "/s_n/n/n"); + input2expected.put("the quick brown", "/the_quick/quick/brown"); + + input2expected.put("s n s", "/s_n/n_s"); + input2expected.put("the fox of", "/the_fox/fox_of"); + + input2expected.put("s s n", "/s_s/s_n"); + input2expected.put("of the fox", "/of_the/the_fox"); + + input2expected.put("s s s", "/s_s/s_s"); + input2expected.put("of the of", "/of_the/the_of"); + + return input2expected; + } + + private static Map initMap() { + Map input2expected = new HashMap(); + + // Stop words used below are "of" "the" and "s" + // one word queries + input2expected.put("the", "/the"); + input2expected.put("foo", "/foo"); + + // two word queries + input2expected.put("brown fox", "/brown/fox"); + input2expected.put("the fox", "/the,the_fox/fox"); + input2expected.put("fox of", "/fox,fox_of/of"); + input2expected.put("of the", "/of,of_the/the"); + + // 3 word combinations s=stopword/common word n=not a stop word + input2expected.put("n n n", "/n/n/n"); + input2expected.put("quick brown fox", "/quick/brown/fox"); + + input2expected.put("n n s", "/n/n,n_s/s"); + input2expected.put("quick brown the", "/quick/brown,brown_the/the"); + + input2expected.put("n s n", "/n,n_s/s,s_n/n"); + input2expected.put("quick the fox", "/quick,quick_the/the,the_fox/fox"); + + input2expected.put("n s s", "/n,n_s/s,s_s/s"); + input2expected.put("fox of the", "/fox,fox_of/of,of_the/the"); + + input2expected.put("s n n", "/s,s_n/n/n"); + input2expected.put("the quick brown", "/the,the_quick/quick/brown"); + + input2expected.put("s n s", "/s,s_n/n,n_s/s"); + input2expected.put("the fox of", "/the,the_fox/fox,fox_of/of"); + + input2expected.put("s s n", "/s,s_s/s,s_n/n"); + input2expected.put("of the fox", "/of,of_the/the,the_fox/fox"); + + input2expected.put("s s s", "/s,s_s/s,s_s/s"); + input2expected.put("of the of", "/of,of_the/the,the_of/of"); + + return input2expected; + } + + /* + * Helper methodsCopied and from CDL XTF BigramsStopFilter.java and slightly + * modified to use with CommonGrams http://xtf.wiki.sourceforge.net/ + */ + /** + * Very simple tokenizer that breaks up a string into a series of Lucene + * {@link Token Token}s. + */ + static class StringTokenStream extends TokenStream { + private String str; + + private int prevEnd = 0; + + private StringTokenizer tok; + + private int count = 0; + + public StringTokenStream(String str, String delim) { + this.str = str; + tok = new StringTokenizer(str, delim); + } + + public Token next() { + if (!tok.hasMoreTokens()) + return null; + count++; + String term = tok.nextToken(); + Token t = new Token(term, str.indexOf(term, prevEnd), str.indexOf(term, + prevEnd) + + term.length(), "word"); + prevEnd = t.endOffset(); + return t; + } + } + + public static String testFilter(String in, String type) throws IOException { + TokenStream nsf; + StringTokenStream ts = new StringTokenStream(in, " ."); + if (type.equals("query")) { + CommonGramsFilter cgf = new CommonGramsFilter(ts, commonWords); + nsf = new CommonGramsQueryFilter(cgf); + } else { + nsf = new CommonGramsFilter(ts, commonWords); + } + + StringBuffer outBuf = new StringBuffer(); + while (true) { + Token t = nsf.next(); + if (t == null) + break; + for (int i = 0; i < t.getPositionIncrement(); i++) + outBuf.append('/'); + if (t.getPositionIncrement() == 0) + outBuf.append(','); + outBuf.append(t.term()); + } + + String out = outBuf.toString(); + out = out.replaceAll(" ", ""); + return out; + } +} diff --git a/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java b/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java new file mode 100644 index 00000000000..b6eca9d6b23 --- /dev/null +++ b/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.analysis; + +import org.apache.solr.util.AbstractSolrTestCase; +import org.apache.solr.common.ResourceLoader; + +import java.util.Set; +import java.util.Map; +import java.util.HashMap; + +/** + * Tests pretty much copied from StopFilterFactoryTest We use the test files + * used by the StopFilterFactoryTest TODO: consider creating separate test files + * so this won't break if stop filter test files change + **/ +public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase { + public String getSchemaFile() { + return "schema-stop-keep.xml"; + } + + public String getSolrConfigFile() { + return "solrconfig.xml"; + } + + public void testInform() throws Exception { + ResourceLoader loader = solrConfig.getResourceLoader(); + assertTrue("loader is null and it shouldn't be", loader != null); + CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory(); + Map args = new HashMap(); + args.put("words", "stop-1.txt"); + args.put("ignoreCase", "true"); + factory.init(args); + factory.inform(loader); + Set words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 2, + words.size() == 2); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory + .isIgnoreCase() == true); + + factory = new CommonGramsQueryFilterFactory(); + args.put("words", "stop-1.txt, stop-2.txt"); + factory.init(args); + factory.inform(loader); + words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 4, + words.size() == 4); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory + .isIgnoreCase() == true); + + } +}