SOLR-908: add CommonGramsFilterFactory CommonGramsQueryFilterFactory

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@817859 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2009-09-22 21:55:57 +00:00
parent 9b97e37c14
commit 3743037bc1
8 changed files with 955 additions and 0 deletions


@@ -321,6 +321,11 @@ New Features
80. SOLR-1447: Simple property injection. <mergePolicy> & <mergeScheduler> syntaxes are deprecated (Jason Rutherglen, noble)
82. SOLR-908: CommonGramsFilterFactory/CommonGramsQueryFilterFactory for
speeding up phrase queries containing common words by indexing
n-grams and using them at query time.
(Tom Burton-West, Jason Rutherglen via yonik)
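
To sketch the intended pairing (a hypothetical illustration, not code from this commit; the class name and word list below are made up): the plain CommonGramsFilter goes into the index-time analyzer, and the query-time analyzer wraps it in a CommonGramsQueryFilter so phrase queries run against the pre-indexed bigrams.

// Hypothetical Lucene 2.9-era sketch; not part of this commit.
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.analysis.CommonGramsFilter;
import org.apache.solr.analysis.CommonGramsQueryFilter;

class CommonGramsPairing {
  static final String[] COMMON = { "the", "of", "in" };

  // Index time: unigrams plus overlaid "gram" bigrams.
  static final Analyzer INDEX = new Analyzer() {
    public TokenStream tokenStream(String field, Reader reader) {
      return new CommonGramsFilter(new WhitespaceTokenizer(reader), COMMON);
    }
  };

  // Query time: bigrams are preferred over unigrams, so phrase queries
  // over common words need far fewer position lookups.
  static final Analyzer QUERY = new Analyzer() {
    public TokenStream tokenStream(String field, Reader reader) {
      return new CommonGramsQueryFilter(
          new CommonGramsFilter(new WhitespaceTokenizer(reader), COMMON));
    }
  };
}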
Optimizations
----------------------
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the


@@ -0,0 +1,223 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy
 * of the License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/*
* TODO: Rewrite to use new TokenStream api from lucene 2.9 when BufferedTokenStream uses it.
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and
* associated constructors
*/
/**
 * Construct bigrams for frequently occurring terms while indexing. Single terms
 * are still indexed too, with bigrams overlaid. This is achieved through the
 * use of {@link Token#setPositionIncrement(int)}. Bigrams have a type of
 * "gram". Example:
 * <ul>
 * <li>input: "the quick brown fox"</li>
 * <li>output: |"the", "the_quick"|"quick"|"brown"|"fox"|</li>
 * <li>"the_quick" has a position increment of 0, so it is in the same position
 * as "the". "the_quick" has a term.type() of "gram"</li>
 * </ul>
 */
/*
* Constructors and makeCommonSet based on similar code in StopFilter
*/
public class CommonGramsFilter extends BufferedTokenStream {
private static final char SEPARATOR = '_';
private final CharArraySet commonWords;
private StringBuilder buffer = new StringBuilder();
/**
 * Construct a token stream filtering the given input using a Set of common
 * words to create bigrams. Outputs both unigrams (with the usual position
 * increment) and bigrams (with position increment 0 and type="gram") wherever
 * one or both of the words in a potential bigram are in the set of common
 * words.
 *
 * @param input TokenStream input in filter chain
 * @param commonWords The set of common words.
 */
public CommonGramsFilter(TokenStream input, Set commonWords) {
this(input, commonWords, false);
}
/**
* Construct a token stream filtering the given input using a Set of common
* words to create bigrams, case-sensitive if ignoreCase is false (unless Set
* is CharArraySet). If <code>commonWords</code> is an instance of
* {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
* construct the set) it will be directly used and <code>ignoreCase</code>
* will be ignored since <code>CharArraySet</code> directly controls case
* sensitivity.
* <p/>
* If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
* new CharArraySet will be constructed and <code>ignoreCase</code> will be
* used to specify the case sensitivity of that set.
*
* @param input TokenStream input in filter chain.
* @param commonWords The set of common words.
 * @param ignoreCase Ignore case when constructing bigrams for common words.
*/
public CommonGramsFilter(TokenStream input, Set commonWords,
boolean ignoreCase) {
super(input);
if (commonWords instanceof CharArraySet) {
this.commonWords = (CharArraySet) commonWords;
} else {
this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
this.commonWords.addAll(commonWords);
}
init();
}
/**
* Construct a token stream filtering the given input using an Array of common
* words to create bigrams.
*
* @param input Tokenstream in filter chain
* @param commonWords words to be used in constructing bigrams
*/
public CommonGramsFilter(TokenStream input, String[] commonWords) {
  this(input, commonWords, false);
}
/**
* Construct a token stream filtering the given input using an Array of common
* words to create bigrams and is case-sensitive if ignoreCase is false.
*
* @param input Tokenstream in filter chain
* @param commonWords words to be used in constructing bigrams
 * @param ignoreCase Ignore case when constructing bigrams for common words.
*/
public CommonGramsFilter(TokenStream input, String[] commonWords,
boolean ignoreCase) {
super(input);
this.commonWords = (CharArraySet) makeCommonSet(commonWords, ignoreCase);
init();
}
// Here for the future move to the lucene 2.9 api. See StopFilter code.
public void init() {
/**
* termAtt = (TermAttribute) addAttribute(TermAttribute.class); posIncrAtt
* =(PositionIncrementAttribute)
* addAttribute(PositionIncrementAttribute.class); typeAdd =(TypeAttribute)
* addAttribute(TypeAttribute.class);
*/
}
/**
 * Build a CharArraySet from an array of common words, appropriate for passing
 * into the CommonGramsFilter constructor. This permits the commonWords
 * construction to be cached once when an Analyzer is constructed.
 *
 * @see #makeCommonSet(java.lang.String[], boolean) passing false to
 * ignoreCase
 */
public static final CharArraySet makeCommonSet(String[] commonWords) {
return makeCommonSet(commonWords, false);
}
/**
 * Build a CharArraySet from an array of common words, appropriate for passing
 * into the CommonGramsFilter constructor, case-sensitive if ignoreCase is
 * false.
 *
 * @param commonWords An array of common words
 * @param ignoreCase If true, all words are lower cased first.
 * @return a Set containing the words
 */
public static final CharArraySet makeCommonSet(String[] commonWords,
boolean ignoreCase) {
CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
commonSet.addAll(Arrays.asList(commonWords));
return commonSet;
}
/**
* Inserts bigrams for common words into a token stream. For each input token,
* output the token. If the token and/or the following token are in the list
* of common words also output a bigram with position increment 0 and
* type="gram"
*/
/*
 * TODO: implement the new lucene 2.9 API incrementToken() instead of the
 * deprecated Token.next().
 * TODO: Consider adding an option to not emit unigram stopwords as in CDL XTF
 * BigramStopFilter; CommonGramsQueryFilter would need to be changed to work
 * with this.
 * TODO: Consider optimizing for the case of three common words in a row, i.e.
 * "man of the year" normally produces 3 bigrams: "man_of", "of_the",
 * "the_year", but with proper management of positions we could eliminate the
 * middle bigram "of_the" and save a disk seek and a whole set of position
 * lookups.
 */
public Token process(Token token) throws IOException {
Token next = peek(1);
// if this is the last token just spit it out. Any commongram would have
// been output in the previous call
if (next == null) {
return token;
}
/*
 * If this token or the next is common, then construct a bigram with
 * type="gram" and position increment = 0 and put it in the output queue. It
 * will be returned when super.next() is called, before this method gets
 * called with a new token from the input stream. See the implementation of
 * next() in BufferedTokenStream.
 */
if (isCommon(token) || isCommon(next)) {
Token gram = gramToken(token, next);
write(gram);
}
// we always return the unigram token
return token;
}
/** True if token is for a common term. */
private boolean isCommon(Token token) {
return commonWords != null
&& commonWords.contains(token.termBuffer(), 0, token.termLength());
}
/** Construct a compound token. */
private Token gramToken(Token first, Token second) {
buffer.setLength(0);
buffer.append(first.termText());
buffer.append(SEPARATOR);
buffer.append(second.termText());
Token result = new Token(buffer.toString(), first.startOffset(), second
.endOffset(), "gram");
result.setPositionIncrement(0);
return result;
}
public void reset() throws IOException {
super.reset();
buffer.setLength(0);
}
}
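
As a quick check of the javadoc example above, here is a sketch in the style of this commit's CommonGramsFilterTest (the Demo class itself is not part of the commit) that prints each term with its position increment:

// Demo sketch mirroring CommonGramsFilterTest later in this commit.
import java.io.StringReader;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.CommonGramsFilter;

public class CommonGramsFilterDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer wt =
        new WhitespaceTokenizer(new StringReader("the quick brown fox"));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, new String[] { "the" });
    TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = (PositionIncrementAttribute) cgf
        .addAttribute(PositionIncrementAttribute.class);
    while (cgf.incrementToken()) {
      // prints: the(+1) the_quick(+0) quick(+1) brown(+1) fox(+1)
      System.out.println(term.term() + " (+" + posIncr.getPositionIncrement() + ")");
    }
  }
}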


@@ -0,0 +1,89 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
* Constructs a CommonGramsFilter
*/
/*
* This is pretty close to a straight copy from StopFilterFactory
*/
public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
ResourceLoaderAware {
public void inform(ResourceLoader loader) {
String commonWordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase", false);
enablePositionIncrements = getBoolean("enablePositionIncrements", false);
if (commonWordFiles != null) {
try {
List<String> files = StrUtils.splitFileNames(commonWordFiles);
if (commonWords == null && files.size() > 0){
//default stopwords list has 35 or so words, but maybe don't make it that big to start
commonWords = new CharArraySet(files.size() * 10, ignoreCase);
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
//TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call
commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase));
}
} catch (IOException e) {
throw new RuntimeException(e);
}
} else {
commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
}
}
//Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
private CharArraySet commonWords;
private boolean ignoreCase;
private boolean enablePositionIncrements;
public boolean isEnablePositionIncrements() {
return enablePositionIncrements;
}
public boolean isIgnoreCase() {
return ignoreCase;
}
public Set getCommonWords() {
return commonWords;
}
public CommonGramsFilter create(TokenStream input) {
CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords, ignoreCase);
return commonGrams;
}
}
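
The factory lifecycle, as exercised by CommonGramsFilterFactoryTest later in this commit, is init(args), then inform(loader), then create(input). A condensed sketch (the word-file name is a placeholder; when no "words" argument is given, inform() falls back to StopAnalyzer.ENGLISH_STOP_WORDS):

// Lifecycle sketch; "commonwords.txt" is a placeholder resource name.
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.CommonGramsFilterFactory;
import org.apache.solr.common.ResourceLoader;

public class CommonGramsFactoryUsage {
  // loader comes from solrConfig.getResourceLoader() in the tests below.
  static TokenStream wrap(ResourceLoader loader, TokenStream input) {
    Map<String, String> args = new HashMap<String, String>();
    args.put("words", "commonwords.txt");
    args.put("ignoreCase", "true");
    CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
    factory.init(args);     // parse the words/ignoreCase arguments
    factory.inform(loader); // load the word file via the ResourceLoader
    return factory.create(input);
  }
}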


@@ -0,0 +1,138 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
/**
 * Wrap a CommonGramsFilter, optimizing phrase queries by only returning single
 * words when they do not begin a bigram (a final unigram already returned as
 * part of the preceding bigram is dropped as well).
 *
 * Example:
 * <ul>
 * <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
 * <li>output of CommonGramsFilter/input to CommonGramsQueryFilter:
 * |"the", "the_rain"|"rain", "rain_in"|"in", "in_spain"|"spain"|"falls"|"mainly"
 * <li>output of CommonGramsQueryFilter: "the_rain", "rain_in", "in_spain",
 * "spain", "falls", "mainly"
 * </ul>
 */
/*
 * TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use
 * the 2.9 lucene TokenStream api, make the necessary changes here. See:
 * http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html
 * and
 * http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798
 */
public class CommonGramsQueryFilter extends BufferedTokenStream {
//private CharArraySet commonWords;
private Token prev;
/**
* Constructor
*
* @param input must be a CommonGramsFilter!
*
*/
public CommonGramsQueryFilter(CommonGramsFilter input) {
super(input);
prev = new Token();
}
public void reset() throws IOException {
super.reset();
prev = new Token();
}
/**
 * Output bigrams whenever possible to optimize queries. Only output unigrams
 * when they do not begin a bigram. Example:
 * <ul>
 * <li>input: "the rain in spain falls mainly"
 * <li>output: "the_rain", "rain_in", "in_spain", "spain", "falls", "mainly"
 * </ul>
 */
public Token process(Token token) throws IOException {
Token next = peek(1);
/*
 * Deal with the last token (next == null when the current token is the last
 * word): the last token will be a unigram. If the previous token was a
 * bigram, then we already output the last token as part of that bigram and
 * should not additionally output the unigram.
 *
 * Example: if the end of the input to the CommonGramsFilter is "...the plain"
 * <ul>
 * <li>current token = "plain"</li>
 * <li>next token = null</li>
 * <li>previous token = "the_plain" (bigram)</li>
 * <li>Since the word "plain" was already output as part of the bigram, we
 * don't output it.</li>
 * </ul>
 * Example: if the end of the input to the CommonGramsFilter is "falls mainly"
 * <ul>
 * <li>current token = "mainly"</li>
 * <li>next token = null</li>
 * <li>previous token = "falls" (unigram)</li>
 * <li>Since we haven't yet output the current token, we output it.</li>
 * </ul>
 */
// Deal with special case of last token
if (next == null) {
  if (prev == null || prev.type() != "gram") {
    // Previous token was a unigram (or this is a one-word query), so the
    // current token has not been output yet: output it
    return token;
  } else {
    // If previous token was a bigram, we already output it and this token
    // was output as part of the bigram so we are done.
    return null;
  }
}
/*
 * Possible cases are:
 *   |token |next
 *  1|word  |gram
 *  2|word  |word
 *
 * The CommonGramsFilter we are wrapping always outputs the unigram word
 * prior to outputting an optional bigram: "the sound of" gets output as
 * |"the", "the_sound"|"sound", "sound_of"|"of". For case 1 we consume the
 * gram from the input stream and output it rather than the current token.
 * This means that the call to super.next(), which reads a token from input
 * and passes it on to this process method, will always get a token of type
 * "word".
 */
if (next != null && next.type() == "gram") {
// consume "next" token from list and output it
token = read();
// Use reinit to clone the token, because clone requires all these args but
// won't take the token type. See:
// http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/Token.html
prev.reinit(token.termBuffer(), 0, token.termLength(), token
.startOffset(), token.endOffset(), token.type());
token.setPositionIncrement(1);
return token;
}
// if the next token is not a bigram, then output the token
// see note above regarding this method of copying token to prev
prev.reinit(token.termBuffer(), 0, token.termLength(), token.startOffset(),
token.endOffset(), token.type());
assert token.type() == "word";
return token;
}
}
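
To see the query-side behavior on a case covered by the test table later in this commit ("the fox of" is expected to come out as /the_fox/fox_of), here is a sketch in the style of testQueryReset (the Demo class is not part of the commit):

// Demo sketch; mirrors testQueryReset in CommonGramsFilterTest below.
import java.io.StringReader;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.CommonGramsFilter;
import org.apache.solr.analysis.CommonGramsQueryFilter;

public class CommonGramsQueryDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer wt =
        new WhitespaceTokenizer(new StringReader("the fox of"));
    CommonGramsQueryFilter filter = new CommonGramsQueryFilter(
        new CommonGramsFilter(wt, new String[] { "the", "of" }));
    // As in testQueryReset, the term attribute is taken from the source tokenizer.
    TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
      System.out.println(term.term()); // prints "the_fox" then "fox_of"
    }
  }
}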


@@ -0,0 +1,98 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
 * Constructs a CommonGramsQueryFilter.
 *
 * This is pretty close to a straight copy from StopFilterFactory.
 */
public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
implements ResourceLoaderAware {
public void inform(ResourceLoader loader) {
String commonWordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase", false);
enablePositionIncrements = getBoolean("enablePositionIncrements", false);
if (commonWordFiles != null) {
try {
List<String> files = StrUtils.splitFileNames(commonWordFiles);
if (commonWords == null && files.size() > 0) {
// default stopwords list has 35 or so words, but maybe don't make it
// that big to start
commonWords = new CharArraySet(files.size() * 10, ignoreCase);
}
for (String file : files) {
List<String> wlist = loader.getLines(file.trim());
// TODO: once StopFilter.makeStopSet(List) method is available, switch
// to using that so we can avoid a toArray() call
commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist
.toArray(new String[0]), ignoreCase));
}
} catch (IOException e) {
throw new RuntimeException(e);
}
} else {
commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(
StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
}
}
// Force the use of a char array set, as it is the most performant, although
// this may break things if Lucene ever goes away from it. See SOLR-1095
private CharArraySet commonWords;
private boolean ignoreCase;
private boolean enablePositionIncrements;
public boolean isEnablePositionIncrements() {
return enablePositionIncrements;
}
public boolean isIgnoreCase() {
return ignoreCase;
}
public Set getCommonWords() {
return commonWords;
}
/**
* Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
*/
public CommonGramsQueryFilter create(TokenStream input) {
CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords,
ignoreCase);
CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
commonGrams);
return commonGramsQuery;
}
}
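
Since index-time grams must match query-time grams, the two factories are meant to be configured with the same word list, one per analyzer side. A hypothetical pairing sketch (the class, method, and "commonwords.txt" resource name are placeholders):

// Hypothetical pairing of the two factories; not part of this commit.
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.CommonGramsFilterFactory;
import org.apache.solr.analysis.CommonGramsQueryFilterFactory;
import org.apache.solr.common.ResourceLoader;

public class CommonGramsFactoryPairing {
  static TokenStream[] pair(ResourceLoader loader, TokenStream indexInput,
      TokenStream queryInput) {
    Map<String, String> args = new HashMap<String, String>();
    args.put("words", "commonwords.txt"); // same list on both sides

    CommonGramsFilterFactory indexFactory = new CommonGramsFilterFactory();
    indexFactory.init(args);
    indexFactory.inform(loader);

    CommonGramsQueryFilterFactory queryFactory = new CommonGramsQueryFilterFactory();
    queryFactory.init(args);
    queryFactory.inform(loader);

    return new TokenStream[] {
        indexFactory.create(indexInput), // unigrams plus overlaid bigrams
        queryFactory.create(queryInput)  // bigrams preferred over unigrams
    };
  }
}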


@@ -0,0 +1,69 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.common.ResourceLoader;
import java.util.Set;
import java.util.Map;
import java.util.HashMap;
/**
 * Tests pretty much copied from StopFilterFactoryTest. We use the test files
 * used by the StopFilterFactoryTest.
 * TODO: consider creating separate test files so this won't break if the stop
 * filter test files change.
 **/
public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
public String getSchemaFile() {
return "schema-stop-keep.xml";
}
public String getSolrConfigFile() {
return "solrconfig.xml";
}
public void testInform() throws Exception {
ResourceLoader loader = solrConfig.getResourceLoader();
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put("words", "stop-1.txt");
args.put("ignoreCase", "true");
factory.init(args);
factory.inform(loader);
Set words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2,
words.size() == 2);
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true);
factory = new CommonGramsFilterFactory();
args.put("words", "stop-1.txt, stop-2.txt");
factory.init(args);
factory.inform(loader);
words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 4,
words.size() == 4);
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true);
}
}


@@ -0,0 +1,265 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.TestBufferedTokenStream.AB_AAB_Stream;
/**
 * Tests CommonGramsFilter and CommonGramsQueryFilter
 */
public class CommonGramsFilterTest extends TestCase {
private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
"of" };
public void testReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class);
assertTrue(cgf.incrementToken());
assertEquals("How", term.term());
assertTrue(cgf.incrementToken());
assertEquals("How_the", term.term());
assertTrue(cgf.incrementToken());
assertEquals("the", term.term());
assertTrue(cgf.incrementToken());
assertEquals("the_s", term.term());
wt.reset(new StringReader(input));
cgf.reset();
assertTrue(cgf.incrementToken());
assertEquals("How", term.term());
}
public void testCommonGramsQueryFilter() throws Exception {
Set<Map.Entry<String, String>> input2expectedSet = initQueryMap().entrySet();
for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
.hasNext();) {
Map.Entry<String, String> me = i.next();
String input = me.getKey();
String expected = me.getValue();
String message = "message: input value is: " + input;
assertEquals(message, expected, testFilter(input, "query"));
}
}
public void testQueryReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class);
assertTrue(nsf.incrementToken());
assertEquals("How_the", term.term());
assertTrue(nsf.incrementToken());
assertEquals("the_s", term.term());
wt.reset(new StringReader(input));
nsf.reset();
assertTrue(nsf.incrementToken());
assertEquals("How_the", term.term());
}
public void testCommonGramsFilter() throws Exception {
Set<Map.Entry<String, String>> input2expectedSet = initMap().entrySet();
for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
.hasNext();) {
Map.Entry<String, String> me = i.next();
String input = me.getKey();
String expected = me.getValue();
String message = "message: input value is: " + input;
assertEquals(message, expected, testFilter(input, "common"));
}
}
/**
 * This is for testing CommonGramsQueryFilter, which outputs a set of tokens
 * optimized for querying with only one token at each position, either a
 * unigram or a bigram. It also will not return a token for the final position
 * if the final word is already in the preceding bigram. Example: (three
 * tokens/positions in)
 * "foo bar the" => "foo:1|bar:2,bar_the:2|the:3" => "foo" "bar_the" (2 tokens
 * out)
 *
 * @return Map of input strings to expected output strings
 */
private static Map<String, String> initQueryMap() {
Map<String, String> input2expected = new LinkedHashMap<String, String>();
// Stop words used below are "of" "the" and "s"
// two word queries
input2expected.put("brown fox", "/brown/fox");
input2expected.put("the fox", "/the_fox");
input2expected.put("fox of", "/fox_of");
input2expected.put("of the", "/of_the");
// one word queries
input2expected.put("the", "/the");
input2expected.put("foo", "/foo");
// 3 word combinations s=stopword/common word n=not a stop word
input2expected.put("n n n", "/n/n/n");
input2expected.put("quick brown fox", "/quick/brown/fox");
input2expected.put("n n s", "/n/n_s");
input2expected.put("quick brown the", "/quick/brown_the");
input2expected.put("n s n", "/n_s/s_n");
input2expected.put("quick the brown", "/quick_the/the_brown");
input2expected.put("n s s", "/n_s/s_s");
input2expected.put("fox of the", "/fox_of/of_the");
input2expected.put("s n n", "/s_n/n/n");
input2expected.put("the quick brown", "/the_quick/quick/brown");
input2expected.put("s n s", "/s_n/n_s");
input2expected.put("the fox of", "/the_fox/fox_of");
input2expected.put("s s n", "/s_s/s_n");
input2expected.put("of the fox", "/of_the/the_fox");
input2expected.put("s s s", "/s_s/s_s");
input2expected.put("of the of", "/of_the/the_of");
return input2expected;
}
private static Map<String, String> initMap() {
Map<String, String> input2expected = new HashMap<String, String>();
// Stop words used below are "of" "the" and "s"
// one word queries
input2expected.put("the", "/the");
input2expected.put("foo", "/foo");
// two word queries
input2expected.put("brown fox", "/brown/fox");
input2expected.put("the fox", "/the,the_fox/fox");
input2expected.put("fox of", "/fox,fox_of/of");
input2expected.put("of the", "/of,of_the/the");
// 3 word combinations s=stopword/common word n=not a stop word
input2expected.put("n n n", "/n/n/n");
input2expected.put("quick brown fox", "/quick/brown/fox");
input2expected.put("n n s", "/n/n,n_s/s");
input2expected.put("quick brown the", "/quick/brown,brown_the/the");
input2expected.put("n s n", "/n,n_s/s,s_n/n");
input2expected.put("quick the fox", "/quick,quick_the/the,the_fox/fox");
input2expected.put("n s s", "/n,n_s/s,s_s/s");
input2expected.put("fox of the", "/fox,fox_of/of,of_the/the");
input2expected.put("s n n", "/s,s_n/n/n");
input2expected.put("the quick brown", "/the,the_quick/quick/brown");
input2expected.put("s n s", "/s,s_n/n,n_s/s");
input2expected.put("the fox of", "/the,the_fox/fox,fox_of/of");
input2expected.put("s s n", "/s,s_s/s,s_n/n");
input2expected.put("of the fox", "/of,of_the/the,the_fox/fox");
input2expected.put("s s s", "/s,s_s/s,s_s/s");
input2expected.put("of the of", "/of,of_the/the,the_of/of");
return input2expected;
}
/*
 * Helper methods copied from CDL XTF BigramStopFilter.java and slightly
 * modified for use with CommonGrams. http://xtf.wiki.sourceforge.net/
 */
/**
* Very simple tokenizer that breaks up a string into a series of Lucene
* {@link Token Token}s.
*/
static class StringTokenStream extends TokenStream {
private String str;
private int prevEnd = 0;
private StringTokenizer tok;
private int count = 0;
public StringTokenStream(String str, String delim) {
this.str = str;
tok = new StringTokenizer(str, delim);
}
public Token next() {
if (!tok.hasMoreTokens())
return null;
count++;
String term = tok.nextToken();
Token t = new Token(term, str.indexOf(term, prevEnd), str.indexOf(term,
prevEnd)
+ term.length(), "word");
prevEnd = t.endOffset();
return t;
}
}
public static String testFilter(String in, String type) throws IOException {
TokenStream nsf;
StringTokenStream ts = new StringTokenStream(in, " .");
if (type.equals("query")) {
CommonGramsFilter cgf = new CommonGramsFilter(ts, commonWords);
nsf = new CommonGramsQueryFilter(cgf);
} else {
nsf = new CommonGramsFilter(ts, commonWords);
}
StringBuffer outBuf = new StringBuffer();
while (true) {
Token t = nsf.next();
if (t == null)
break;
for (int i = 0; i < t.getPositionIncrement(); i++)
outBuf.append('/');
if (t.getPositionIncrement() == 0)
outBuf.append(',');
outBuf.append(t.term());
}
String out = outBuf.toString();
out = out.replaceAll(" ", "");
return out;
}
}
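
To decode the expected strings in the maps above: testFilter writes '/' each time a token advances a position and ',' before a token at the same position (increment 0). A short sketch tying the notation to two cases from the maps (the test class name here is hypothetical):

// Notation sketch, grounded in initMap/initQueryMap above; not part of this commit.
import junit.framework.TestCase;

public class CommonGramsNotationTest extends TestCase {
  public void testNotation() throws Exception {
    // "/the,the_fox/fox": position 1 holds "the" plus the overlaid bigram
    // "the_fox" (comma = increment 0); position 2 holds "fox".
    assertEquals("/the,the_fox/fox",
        CommonGramsFilterTest.testFilter("the fox", "common"));
    // The query side keeps only the bigram.
    assertEquals("/the_fox",
        CommonGramsFilterTest.testFilter("the fox", "query"));
  }
}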


@@ -0,0 +1,68 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.common.ResourceLoader;
import java.util.Set;
import java.util.Map;
import java.util.HashMap;
/**
 * Tests pretty much copied from StopFilterFactoryTest. We use the test files
 * used by the StopFilterFactoryTest.
 * TODO: consider creating separate test files so this won't break if the stop
 * filter test files change.
 **/
public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
public String getSchemaFile() {
return "schema-stop-keep.xml";
}
public String getSolrConfigFile() {
return "solrconfig.xml";
}
public void testInform() throws Exception {
ResourceLoader loader = solrConfig.getResourceLoader();
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put("words", "stop-1.txt");
args.put("ignoreCase", "true");
factory.init(args);
factory.inform(loader);
Set words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2,
words.size() == 2);
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true);
factory = new CommonGramsQueryFilterFactory();
args.put("words", "stop-1.txt, stop-2.txt");
factory.init(args);
factory.inform(loader);
words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 4,
words.size() == 4);
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory
.isIgnoreCase() == true);
}
}