mirror of https://github.com/apache/lucene.git
SOLR-908: add CommonGramsFilterFactory CommonGramsQueryFilterFactory
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@817859 13f79535-47bb-0310-9956-ffa450edef68
parent 9b97e37c14
commit 3743037bc1
CHANGES.txt
@@ -321,6 +321,11 @@ New Features

80. SOLR-1447: Simple property injection. <mergePolicy> & <mergeScheduler>
    syntaxes are deprecated. (Jason Rutherglen, noble)

82. SOLR-908: CommonGramsFilterFactory/CommonGramsQueryFilterFactory for
    speeding up phrase queries containing common words by indexing
    n-grams and using them at query time.
    (Tom Burton-West, Jason Rutherglen via yonik)

Optimizations
----------------------

 1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
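To make the CHANGES entry concrete, here is a minimal sketch (not part of the commit) contrasting what the index-time and query-time filters below emit for the same phrase. It assumes the Lucene 2.4-era Token API that the committed code uses (the deprecated next() rather than incrementToken()); the expected output in the comments is inferred from the filters' process() logic and from the test expectations further down.

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.analysis.CommonGramsFilter;
import org.apache.solr.analysis.CommonGramsQueryFilter;

public class CommonGramsDemo {
  public static void main(String[] args) throws Exception {
    String[] common = { "the" };

    // Index side: unigrams plus overlaid bigrams of type "gram" at posInc 0.
    CommonGramsFilter indexSide = new CommonGramsFilter(
        new WhitespaceTokenizer(new StringReader("the quick fox")), common);
    for (Token t = indexSide.next(); t != null; t = indexSide.next()) {
      // expected: the(word,+1) the_quick(gram,+0) quick(word,+1) fox(word,+1)
      System.out.println(t.term() + "(" + t.type() + ",+"
          + t.getPositionIncrement() + ")");
    }

    // Query side: one token per position, preferring bigrams.
    CommonGramsQueryFilter querySide = new CommonGramsQueryFilter(
        new CommonGramsFilter(
            new WhitespaceTokenizer(new StringReader("the quick fox")), common));
    for (Token t = querySide.next(); t != null; t = querySide.next()) {
      // expected: the_quick quick fox
      System.out.println(t.term());
    }
  }
}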
CommonGramsFilter.java
@@ -0,0 +1,223 @@
/*
 * Licensed under the Apache License,
 * Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package org.apache.solr.analysis;

import java.io.IOException;
import java.util.Arrays;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/*
 * TODO: Rewrite to use the new TokenStream api from lucene 2.9 when BufferedTokenStream uses it.
 * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to the
 * stop list and associated constructors.
 */

/**
 * Construct bigrams for frequently occurring terms while indexing. Single terms
 * are still indexed too, with bigrams overlaid. This is achieved through the
 * use of {@link Token#setPositionIncrement(int)}. Bigrams have a type
 * of "gram". Example:
 * <ul>
 * <li>input: "the quick brown fox"</li>
 * <li>output: |"the","the_quick"|"quick"|"brown"|"fox"|</li>
 * <li>"the_quick" has a position increment of 0 so it is in the same position
 * as "the"; "the_quick" has a term.type() of "gram"</li>
 * </ul>
 */

/*
 * Constructors and makeCommonSet based on similar code in StopFilter
 */

public class CommonGramsFilter extends BufferedTokenStream {

  private static final char SEPARATOR = '_';

  private final CharArraySet commonWords;

  private StringBuilder buffer = new StringBuilder();

  /**
   * Construct a token stream filtering the given input using a Set of common
   * words to create bigrams. Outputs both unigrams with position increment and
   * bigrams with position increment 0 and type="gram" where one or both of the
   * words in a potential bigram are in the set of common words.
   *
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words.
   */
  public CommonGramsFilter(TokenStream input, Set commonWords) {
    this(input, commonWords, false);
  }

  /**
   * Construct a token stream filtering the given input using a Set of common
   * words to create bigrams, case-sensitive if ignoreCase is false (unless the
   * Set is a CharArraySet). If <code>commonWords</code> is an instance of
   * {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
   * construct the set) it will be directly used and <code>ignoreCase</code>
   * will be ignored since <code>CharArraySet</code> directly controls case
   * sensitivity.
   * <p/>
   * If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
   * new CharArraySet will be constructed and <code>ignoreCase</code> will be
   * used to specify the case sensitivity of that set.
   *
   * @param input TokenStream input in filter chain.
   * @param commonWords The set of common words.
   * @param ignoreCase Ignore case when constructing bigrams for common words.
   */
  public CommonGramsFilter(TokenStream input, Set commonWords,
      boolean ignoreCase) {
    super(input);
    if (commonWords instanceof CharArraySet) {
      this.commonWords = (CharArraySet) commonWords;
    } else {
      this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
      this.commonWords.addAll(commonWords);
    }
    init();
  }

  /**
   * Construct a token stream filtering the given input using an array of
   * common words to create bigrams.
   *
   * @param input TokenStream input in filter chain
   * @param commonWords words to be used in constructing bigrams
   */
  public CommonGramsFilter(TokenStream input, String[] commonWords) {
    this(input, commonWords, false);
  }

  /**
   * Construct a token stream filtering the given input using an array of
   * common words to create bigrams; case-sensitive if ignoreCase is false.
   *
   * @param input TokenStream input in filter chain
   * @param commonWords words to be used in constructing bigrams
   * @param ignoreCase Ignore case when constructing bigrams for common words.
   */
  public CommonGramsFilter(TokenStream input, String[] commonWords,
      boolean ignoreCase) {
    super(input);
    this.commonWords = makeCommonSet(commonWords, ignoreCase);
    init();
  }

  // Here for a future move to the 2.9 api; see the StopFilter code.
  public void init() {
    /*
     * termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     * posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
     * typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
     */
  }

  /**
   * Build a CharArraySet from an array of common words, appropriate for passing
   * into the CommonGramsFilter constructor. This permits this commonWords
   * construction to be cached once when an Analyzer is constructed.
   *
   * @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
   */
  public static final CharArraySet makeCommonSet(String[] commonWords) {
    return makeCommonSet(commonWords, false);
  }

  /**
   * Build a CharArraySet from an array of common words, appropriate for passing
   * into the CommonGramsFilter constructor; case-sensitive if ignoreCase is
   * false.
   *
   * @param commonWords the words to build the set from
   * @param ignoreCase If true, all words are lower cased first.
   * @return a Set containing the words
   */
  public static final CharArraySet makeCommonSet(String[] commonWords,
      boolean ignoreCase) {
    CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
    commonSet.addAll(Arrays.asList(commonWords));
    return commonSet;
  }

  /**
   * Inserts bigrams for common words into a token stream. For each input token,
   * output the token. If the token and/or the following token are in the list
   * of common words, also output a bigram with position increment 0 and
   * type="gram".
   */
  /*
   * TODO: implement the new lucene 2.9 API incrementToken() instead of the
   * deprecated Token.next().
   * TODO: Consider adding an option to not emit unigram stopwords as in CDL
   * XTF BigramStopFilter; CommonGramsQueryFilter would need to be changed to
   * work with this.
   * TODO: Consider optimizing for the case of three commongrams, i.e.
   * "man of the year" normally produces 3 bigrams: "man-of", "of-the",
   * "the-year", but with proper management of positions we could eliminate the
   * middle bigram "of-the" and save a disk seek and a whole set of position
   * lookups.
   */
  public Token process(Token token) throws IOException {
    Token next = peek(1);
    // if this is the last token just spit it out. Any commongram would have
    // been output in the previous call
    if (next == null) {
      return token;
    }

    /*
     * If this token or the next is common, construct a bigram with type="gram"
     * and position increment 0, and put it in the output queue. It will be
     * returned when super.next() is called, before this method gets called
     * with a new token from the input stream. See the implementation of
     * next() in BufferedTokenStream.
     */
    if (isCommon(token) || isCommon(next)) {
      Token gram = gramToken(token, next);
      write(gram);
    }
    // we always return the unigram token
    return token;
  }

  /** True if token is for a common term. */
  private boolean isCommon(Token token) {
    return commonWords != null
        && commonWords.contains(token.termBuffer(), 0, token.termLength());
  }

  /** Construct a compound token. */
  private Token gramToken(Token first, Token second) {
    buffer.setLength(0);
    buffer.append(first.termText());
    buffer.append(SEPARATOR);
    buffer.append(second.termText());
    Token result = new Token(buffer.toString(), first.startOffset(),
        second.endOffset(), "gram");
    result.setPositionIncrement(0);
    return result;
  }

  public void reset() throws IOException {
    super.reset();
    buffer.setLength(0);
  }
}
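The makeCommonSet javadoc above suggests building the CharArraySet once and reusing it. A minimal sketch of that caching pattern in a Lucene 2.4-era Analyzer (the analyzer class and its word list are illustrative, not part of the commit):

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.analysis.CommonGramsFilter;

public class CommonGramsAnalyzer extends Analyzer {
  // Built once when the Analyzer is constructed, then reused for every
  // tokenStream() call instead of rebuilding the set per field/document.
  private final CharArraySet commonSet =
      CommonGramsFilter.makeCommonSet(new String[] { "the", "of" }, true);

  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new CommonGramsFilter(new WhitespaceTokenizer(reader), commonSet);
  }
}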
CommonGramsFilterFactory.java
@@ -0,0 +1,89 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;

/**
 * Constructs a CommonGramsFilter.
 */

/*
 * This is pretty close to a straight copy from StopFilterFactory.
 */
public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
    ResourceLoaderAware {

  public void inform(ResourceLoader loader) {
    String commonWordFiles = args.get("words");
    ignoreCase = getBoolean("ignoreCase", false);
    enablePositionIncrements = getBoolean("enablePositionIncrements", false);

    if (commonWordFiles != null) {
      try {
        List<String> files = StrUtils.splitFileNames(commonWordFiles);
        if (commonWords == null && files.size() > 0) {
          // default stopwords list has 35 or so words, but maybe don't make it
          // that big to start
          commonWords = new CharArraySet(files.size() * 10, ignoreCase);
        }
        for (String file : files) {
          List<String> wlist = loader.getLines(file.trim());
          // TODO: once StopFilter.makeStopSet(List) method is available, switch
          // to using that so we can avoid a toArray() call
          commonWords.addAll(CommonGramsFilter.makeCommonSet(
              (String[]) wlist.toArray(new String[0]), ignoreCase));
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    } else {
      commonWords = CommonGramsFilter.makeCommonSet(
          StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
    }
  }

  // Force the use of a char array set, as it is the most performant, although
  // this may break things if Lucene ever goes away from it. See SOLR-1095.
  private CharArraySet commonWords;

  private boolean ignoreCase;

  private boolean enablePositionIncrements;

  public boolean isEnablePositionIncrements() {
    return enablePositionIncrements;
  }

  public boolean isIgnoreCase() {
    return ignoreCase;
  }

  public Set getCommonWords() {
    return commonWords;
  }

  public CommonGramsFilter create(TokenStream input) {
    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords,
        ignoreCase);
    return commonGrams;
  }
}
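For programmatic use, the factory follows Solr's init/inform/create lifecycle, as the tests at the end of this commit exercise. A sketch (the words file name is a hypothetical example resolved by whatever ResourceLoader is passed in):

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.analysis.CommonGramsFilterFactory;
import org.apache.solr.common.ResourceLoader;

public class CommonGramsFactoryDemo {
  /** Wire the factory up the way Solr does: init(args), inform(loader), create(input). */
  static TokenStream build(ResourceLoader loader) {
    CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("words", "commonwords.txt"); // hypothetical file name
    args.put("ignoreCase", "true");
    factory.init(args);
    factory.inform(loader); // loads the word files; without a "words" arg the
                            // factory falls back to StopAnalyzer.ENGLISH_STOP_WORDS
    return factory.create(
        new WhitespaceTokenizer(new StringReader("the quick fox")));
  }
}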
CommonGramsQueryFilter.java
@@ -0,0 +1,138 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;

/**
 * Wrap a CommonGramsFilter, optimizing phrase queries by only returning single
 * words when they are not a member of a bigram.
 *
 * Example:
 * <ul>
 * <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
 * <li>output of CommonGramsFilter/input to CommonGramsQueryFilter:
 * |"the","the_rain"|"rain","rain_in"|"in","in_spain"|"spain"|"falls"|"mainly"
 * <li>output of CommonGramsQueryFilter: "the_rain", "rain_in", "in_spain",
 * "spain", "falls", "mainly"
 * </ul>
 */

/*
 * TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use the
 * 2.9 lucene TokenStream api, make the necessary changes here.
 * See: http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html
 * and http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798
 */
public class CommonGramsQueryFilter extends BufferedTokenStream {
  //private CharArraySet commonWords;
  private Token prev;

  /**
   * Constructor
   *
   * @param input must be a CommonGramsFilter!
   */
  public CommonGramsQueryFilter(CommonGramsFilter input) {
    super(input);
    prev = new Token();
  }

  public void reset() throws IOException {
    super.reset();
    prev = new Token();
  }

  /**
   * Output bigrams whenever possible to optimize queries. Only output unigrams
   * when they are not a member of a bigram. Example:
   * <ul>
   * <li>input: "the rain in spain falls mainly"
   * <li>output: "the_rain", "rain_in", "in_spain", "spain", "falls", "mainly"
   * </ul>
   */
  public Token process(Token token) throws IOException {
    Token next = peek(1);

    /*
     * Deal with the last token (next == null when the current token is the
     * last word). The last token will be a unigram. If the previous token was
     * a bigram, then we already output the last word as part of the bigram and
     * should not additionally output the unigram.
     *
     * Example: if the end of the input to the CommonGramsFilter is
     * "...the plain":
     *   current token = "plain", next token = null,
     *   previous token = "the_plain" (bigram);
     *   since the word "plain" was already output as part of the bigram, we
     *   don't output it.
     *
     * Example: if the end of the input is "falls mainly":
     *   current token = "mainly", next token = null,
     *   previous token = "falls" (unigram);
     *   since we haven't yet output the current token, we output it.
     */

    // Deal with the special case of the last token
    if (next == null) {
      if (prev == null) {
        // This is the first and only token, i.e. a one word query
        return token;
      }
      if (prev != null && !"gram".equals(prev.type())) {
        // If the previous token was a unigram, output the current token
        return token;
      } else {
        // If the previous token was a bigram, we already output it and this
        // token was output as part of the bigram, so we are done.
        return null;
      }
    }

    /*
     * Possible cases are:
     *      | token | next
     *    1 | word  | gram
     *    2 | word  | word
     *
     * The CommonGramsFilter we are wrapping always outputs the unigram word
     * prior to outputting an optional bigram: "the sound of" gets output as
     * |"the", "the_sound"|"sound", "sound_of". For case 1 we consume the gram
     * from the input stream and output it rather than the current token. This
     * means that the call to super.next(), which reads a token from input and
     * passes it on to this process method, will always get a token of type
     * word.
     */
    if (next != null && "gram".equals(next.type())) {
      // consume the "next" token from the list and output it
      token = read();
      // use reinit to clone the token, because clone requires all these args
      // but won't take the token.type
      // see http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/Token.html
      prev.reinit(token.termBuffer(), 0, token.termLength(),
          token.startOffset(), token.endOffset(), token.type());
      token.setPositionIncrement(1);
      return token;
    }

    // if the next token is not a bigram, then output the token
    // see the note above regarding this method of copying token to prev
    prev.reinit(token.termBuffer(), 0, token.termLength(), token.startOffset(),
        token.endOffset(), token.type());
    assert "word".equals(token.type());
    return token;
  }
}
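The end-of-stream suppression in process() above is the main behavioral subtlety. A small sketch of it (not part of the commit; the expected output in the comment is inferred from the logic above and matches the "of the fox" expectation in the tests below):

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.analysis.CommonGramsFilter;
import org.apache.solr.analysis.CommonGramsQueryFilter;

public class QueryFilterLastTokenDemo {
  public static void main(String[] args) throws Exception {
    String[] common = { "of", "the" };
    CommonGramsQueryFilter q = new CommonGramsQueryFilter(
        new CommonGramsFilter(
            new WhitespaceTokenizer(new StringReader("of the fox")), common));
    for (Token t = q.next(); t != null; t = q.next()) {
      // expected: of_the, the_fox -- the final "fox" is suppressed because it
      // was already emitted as part of the preceding bigram "the_fox"
      System.out.println(t.term());
    }
  }
}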
CommonGramsQueryFilterFactory.java
@@ -0,0 +1,98 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;

/**
 * Constructs a CommonGramsQueryFilter.
 *
 * This is pretty close to a straight copy from StopFilterFactory.
 */
public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
    implements ResourceLoaderAware {

  public void inform(ResourceLoader loader) {
    String commonWordFiles = args.get("words");
    ignoreCase = getBoolean("ignoreCase", false);
    enablePositionIncrements = getBoolean("enablePositionIncrements", false);

    if (commonWordFiles != null) {
      try {
        List<String> files = StrUtils.splitFileNames(commonWordFiles);
        if (commonWords == null && files.size() > 0) {
          // default stopwords list has 35 or so words, but maybe don't make it
          // that big to start
          commonWords = new CharArraySet(files.size() * 10, ignoreCase);
        }
        for (String file : files) {
          List<String> wlist = loader.getLines(file.trim());
          // TODO: once StopFilter.makeStopSet(List) method is available, switch
          // to using that so we can avoid a toArray() call
          commonWords.addAll(CommonGramsFilter.makeCommonSet(
              (String[]) wlist.toArray(new String[0]), ignoreCase));
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    } else {
      commonWords = CommonGramsFilter.makeCommonSet(
          StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
    }
  }

  // Force the use of a char array set, as it is the most performant, although
  // this may break things if Lucene ever goes away from it. See SOLR-1095.
  private CharArraySet commonWords;

  private boolean ignoreCase;

  private boolean enablePositionIncrements;

  public boolean isEnablePositionIncrements() {
    return enablePositionIncrements;
  }

  public boolean isIgnoreCase() {
    return ignoreCase;
  }

  public Set getCommonWords() {
    return commonWords;
  }

  /**
   * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
   */
  public CommonGramsQueryFilter create(TokenStream input) {
    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords,
        ignoreCase);
    CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
        commonGrams);
    return commonGramsQuery;
  }
}
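The two factories are meant to be paired asymmetrically, per the CHANGES entry: CommonGramsFilterFactory in the index-time analyzer chain and CommonGramsQueryFilterFactory in the query-time chain, so bigram-only query tokens match the overlaid bigrams in the index. A sketch of that pairing (the helper class and method names are illustrative; with no "words" arg both factories default to the English stop words):

import java.io.Reader;
import java.util.HashMap;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.analysis.CommonGramsFilterFactory;
import org.apache.solr.analysis.CommonGramsQueryFilterFactory;
import org.apache.solr.common.ResourceLoader;

public class CommonGramsPairing {
  static TokenStream indexChain(ResourceLoader loader, Reader text) {
    CommonGramsFilterFactory f = new CommonGramsFilterFactory();
    f.init(new HashMap<String, String>()); // defaults: English stop words
    f.inform(loader);
    return f.create(new WhitespaceTokenizer(text)); // unigrams + bigrams
  }

  static TokenStream queryChain(ResourceLoader loader, Reader text) {
    CommonGramsQueryFilterFactory f = new CommonGramsQueryFilterFactory();
    f.init(new HashMap<String, String>());
    f.inform(loader);
    return f.create(new WhitespaceTokenizer(text)); // one token per position
  }
}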
CommonGramsFilterFactoryTest.java
@@ -0,0 +1,69 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.common.ResourceLoader;

import java.util.Set;
import java.util.Map;
import java.util.HashMap;

/**
 * Tests pretty much copied from StopFilterFactoryTest. We use the test files
 * used by the StopFilterFactoryTest. TODO: consider creating separate test
 * files so this won't break if the stop filter test files change.
 */
public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
  public String getSchemaFile() {
    return "schema-stop-keep.xml";
  }

  public String getSolrConfigFile() {
    return "solrconfig.xml";
  }

  public void testInform() throws Exception {
    ResourceLoader loader = solrConfig.getResourceLoader();
    assertTrue("loader is null and it shouldn't be", loader != null);
    CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("words", "stop-1.txt");
    args.put("ignoreCase", "true");
    factory.init(args);
    factory.inform(loader);
    Set words = factory.getCommonWords();
    assertTrue("words is null and it shouldn't be", words != null);
    assertTrue("words Size: " + words.size() + " is not: " + 2,
        words.size() == 2);
    assertTrue(factory.isIgnoreCase() + " does not equal: " + true,
        factory.isIgnoreCase() == true);

    factory = new CommonGramsFilterFactory();
    args.put("words", "stop-1.txt, stop-2.txt");
    factory.init(args);
    factory.inform(loader);
    words = factory.getCommonWords();
    assertTrue("words is null and it shouldn't be", words != null);
    assertTrue("words Size: " + words.size() + " is not: " + 4,
        words.size() == 4);
    assertTrue(factory.isIgnoreCase() + " does not equal: " + true,
        factory.isIgnoreCase() == true);
  }
}
CommonGramsFilterTest.java
@@ -0,0 +1,265 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.TestBufferedTokenStream.AB_AAB_Stream;

/**
 * Tests CommonGramsFilter and CommonGramsQueryFilter.
 */
public class CommonGramsFilterTest extends TestCase {
  private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
      "of" };

  public void testReset() throws Exception {
    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);

    TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class);
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.term());
    assertTrue(cgf.incrementToken());
    assertEquals("How_the", term.term());
    assertTrue(cgf.incrementToken());
    assertEquals("the", term.term());
    assertTrue(cgf.incrementToken());
    assertEquals("the_s", term.term());

    wt.reset(new StringReader(input));
    cgf.reset();
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.term());
  }

  public void testCommonGramsQueryFilter() throws Exception {
    Set<Map.Entry<String, String>> input2expectedSet = initQueryMap().entrySet();
    for (Iterator<Entry<String, String>> i = input2expectedSet.iterator();
        i.hasNext();) {
      Map.Entry<String, String> me = i.next();
      String input = me.getKey();
      String expected = me.getValue();
      String message = "message: input value is: " + input;
      assertEquals(message, expected, testFilter(input, "query"));
    }
  }

  public void testQueryReset() throws Exception {
    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
    CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

    TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class);
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.term());
    assertTrue(nsf.incrementToken());
    assertEquals("the_s", term.term());

    wt.reset(new StringReader(input));
    nsf.reset();
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.term());
  }

  public void testCommonGramsFilter() throws Exception {
    Set<Map.Entry<String, String>> input2expectedSet = initMap().entrySet();
    for (Iterator<Entry<String, String>> i = input2expectedSet.iterator();
        i.hasNext();) {
      Map.Entry<String, String> me = i.next();
      String input = me.getKey();
      String expected = me.getValue();
      String message = "message: input value is: " + input;
      assertEquals(message, expected, testFilter(input, "common"));
    }
  }

  /**
   * This is for testing CommonGramsQueryFilter, which outputs a set of tokens
   * optimized for querying with only one token at each position, either a
   * unigram or a bigram. It also will not return a token for the final
   * position if the final word is already in the preceding bigram.
   * Example: (three tokens/positions in)
   * "foo bar the" => "foo:1|bar:2,bar_the:2|the:3" => "foo" "bar_the"
   * (2 tokens out)
   *
   * @return Map<String,String>
   */
  private static Map<String, String> initQueryMap() {
    Map<String, String> input2expected = new LinkedHashMap<String, String>();

    // Stop words used below are "of", "the" and "s"

    // two word queries
    input2expected.put("brown fox", "/brown/fox");
    input2expected.put("the fox", "/the_fox");
    input2expected.put("fox of", "/fox_of");
    input2expected.put("of the", "/of_the");

    // one word queries
    input2expected.put("the", "/the");
    input2expected.put("foo", "/foo");

    // 3 word combinations, s = stopword/common word, n = not a stop word
    input2expected.put("n n n", "/n/n/n");
    input2expected.put("quick brown fox", "/quick/brown/fox");

    input2expected.put("n n s", "/n/n_s");
    input2expected.put("quick brown the", "/quick/brown_the");

    input2expected.put("n s n", "/n_s/s_n");
    input2expected.put("quick the brown", "/quick_the/the_brown");

    input2expected.put("n s s", "/n_s/s_s");
    input2expected.put("fox of the", "/fox_of/of_the");

    input2expected.put("s n n", "/s_n/n/n");
    input2expected.put("the quick brown", "/the_quick/quick/brown");

    input2expected.put("s n s", "/s_n/n_s");
    input2expected.put("the fox of", "/the_fox/fox_of");

    input2expected.put("s s n", "/s_s/s_n");
    input2expected.put("of the fox", "/of_the/the_fox");

    input2expected.put("s s s", "/s_s/s_s");
    input2expected.put("of the of", "/of_the/the_of");

    return input2expected;
  }

  private static Map<String, String> initMap() {
    Map<String, String> input2expected = new HashMap<String, String>();

    // Stop words used below are "of", "the" and "s"
    // one word queries
    input2expected.put("the", "/the");
    input2expected.put("foo", "/foo");

    // two word queries
    input2expected.put("brown fox", "/brown/fox");
    input2expected.put("the fox", "/the,the_fox/fox");
    input2expected.put("fox of", "/fox,fox_of/of");
    input2expected.put("of the", "/of,of_the/the");

    // 3 word combinations, s = stopword/common word, n = not a stop word
    input2expected.put("n n n", "/n/n/n");
    input2expected.put("quick brown fox", "/quick/brown/fox");

    input2expected.put("n n s", "/n/n,n_s/s");
    input2expected.put("quick brown the", "/quick/brown,brown_the/the");

    input2expected.put("n s n", "/n,n_s/s,s_n/n");
    input2expected.put("quick the fox", "/quick,quick_the/the,the_fox/fox");

    input2expected.put("n s s", "/n,n_s/s,s_s/s");
    input2expected.put("fox of the", "/fox,fox_of/of,of_the/the");

    input2expected.put("s n n", "/s,s_n/n/n");
    input2expected.put("the quick brown", "/the,the_quick/quick/brown");

    input2expected.put("s n s", "/s,s_n/n,n_s/s");
    input2expected.put("the fox of", "/the,the_fox/fox,fox_of/of");

    input2expected.put("s s n", "/s,s_s/s,s_n/n");
    input2expected.put("of the fox", "/of,of_the/the,the_fox/fox");

    input2expected.put("s s s", "/s,s_s/s,s_s/s");
    input2expected.put("of the of", "/of,of_the/the,the_of/of");

    return input2expected;
  }

  /*
   * Helper methods copied from CDL XTF BigramsStopFilter.java and slightly
   * modified for use with CommonGrams. http://xtf.wiki.sourceforge.net/
   */

  /**
   * Very simple tokenizer that breaks up a string into a series of Lucene
   * {@link Token Token}s.
   */
  static class StringTokenStream extends TokenStream {
    private String str;

    private int prevEnd = 0;

    private StringTokenizer tok;

    private int count = 0;

    public StringTokenStream(String str, String delim) {
      this.str = str;
      tok = new StringTokenizer(str, delim);
    }

    public Token next() {
      if (!tok.hasMoreTokens())
        return null;
      count++;
      String term = tok.nextToken();
      int start = str.indexOf(term, prevEnd);
      Token t = new Token(term, start, start + term.length(), "word");
      prevEnd = t.endOffset();
      return t;
    }
  }

  public static String testFilter(String in, String type) throws IOException {
    TokenStream nsf;
    StringTokenStream ts = new StringTokenStream(in, " .");
    if (type.equals("query")) {
      CommonGramsFilter cgf = new CommonGramsFilter(ts, commonWords);
      nsf = new CommonGramsQueryFilter(cgf);
    } else {
      nsf = new CommonGramsFilter(ts, commonWords);
    }

    StringBuffer outBuf = new StringBuffer();
    while (true) {
      Token t = nsf.next();
      if (t == null)
        break;
      for (int i = 0; i < t.getPositionIncrement(); i++)
        outBuf.append('/');
      if (t.getPositionIncrement() == 0)
        outBuf.append(',');
      outBuf.append(t.term());
    }

    String out = outBuf.toString();
    out = out.replaceAll(" ", "");
    return out;
  }
}
CommonGramsQueryFilterFactoryTest.java
@@ -0,0 +1,68 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;

import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.common.ResourceLoader;

import java.util.Set;
import java.util.Map;
import java.util.HashMap;

/**
 * Tests pretty much copied from StopFilterFactoryTest. We use the test files
 * used by the StopFilterFactoryTest. TODO: consider creating separate test
 * files so this won't break if the stop filter test files change.
 */
public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
  public String getSchemaFile() {
    return "schema-stop-keep.xml";
  }

  public String getSolrConfigFile() {
    return "solrconfig.xml";
  }

  public void testInform() throws Exception {
    ResourceLoader loader = solrConfig.getResourceLoader();
    assertTrue("loader is null and it shouldn't be", loader != null);
    CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("words", "stop-1.txt");
    args.put("ignoreCase", "true");
    factory.init(args);
    factory.inform(loader);
    Set words = factory.getCommonWords();
    assertTrue("words is null and it shouldn't be", words != null);
    assertTrue("words Size: " + words.size() + " is not: " + 2,
        words.size() == 2);
    assertTrue(factory.isIgnoreCase() + " does not equal: " + true,
        factory.isIgnoreCase() == true);

    factory = new CommonGramsQueryFilterFactory();
    args.put("words", "stop-1.txt, stop-2.txt");
    factory.init(args);
    factory.inform(loader);
    words = factory.getCommonWords();
    assertTrue("words is null and it shouldn't be", words != null);
    assertTrue("words Size: " + words.size() + " is not: " + 4,
        words.size() == 4);
    assertTrue(factory.isIgnoreCase() + " does not equal: " + true,
        factory.isIgnoreCase() == true);
  }
}