mirror of https://github.com/apache/lucene.git
SOLR-330: Converted Solr tokenstreams to use Lucene's char[] capabilities
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@643465 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent c124044825
commit e2c2a8d240
@@ -867,6 +867,7 @@ Optimizations
     a single token per document (not multiValued & not tokenized) by using the
     Lucene FieldCache entry for that field to tally term counts. The first request
     utilizing the FieldCache will take longer than subsequent ones.
+ 7. Converted TokenStreams to use Lucene's new char array based capabilities. (SOLR-330, gsingers)

 Bug Fixes
  1. Fixed delete-by-id for field types who's indexed form is different
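The pattern this commit applies throughout: instead of materializing a String per token via termText(), a filter reuses the Token handed to next(Token) and works directly on its backing char[]. A minimal sketch of the idea (a hypothetical uppercasing filter, not part of this commit), against the Lucene 2.3-era Token/TokenFilter API used below:

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    /** Hypothetical example: uppercases terms in place, with no String allocation. */
    final class UpperCaseExampleFilter extends TokenFilter {
      UpperCaseExampleFilter(TokenStream in) { super(in); }

      @Override
      public Token next(Token in) throws IOException {
        Token t = input.next(in);      // reuse the caller's Token instance
        if (t == null) return null;
        char[] buf = t.termBuffer();   // backing array, valid up to termLength()
        int len = t.termLength();
        for (int i = 0; i < len; i++) {
          buf[i] = Character.toUpperCase(buf[i]);  // mutate in place
        }
        return t;
      }
    }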
@@ -55,7 +55,7 @@ import java.util.LinkedList;
  * @version $Id$
  */
 public abstract class BufferedTokenStream extends TokenStream {
-  // in the futute, might be faster if we implemented as an array based CircularQueue
+  // in the future, might be faster if we implemented as an array based CircularQueue
   private final LinkedList<Token> inQueue = new LinkedList<Token>();
   private final LinkedList<Token> outQueue = new LinkedList<Token>();
   private final TokenStream input;
@@ -17,97 +17,92 @@

 package org.apache.solr.analysis;

+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.Token;

-import java.util.List;
-import java.util.Set;
 import java.io.IOException;
+import java.util.List;

 /**
  * @version $Id$
  */
 public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";

   public void inform(ResourceLoader loader) {
-    String wordFile = args.get("protected");
+    String wordFile = args.get(PROTECTED_TOKENS);
     if (wordFile != null) {
       try {
         List<String> wlist = loader.getLines(wordFile);
-        protectedWords = StopFilter.makeStopSet((String[])wlist.toArray(new String[0]));
+        //This cast is safe in Lucene
+        protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
     }
   }

-  private Set protectedWords = null;
+  private CharArraySet protectedWords = null;

   public EnglishPorterFilter create(TokenStream input) {
-    return new EnglishPorterFilter(input,protectedWords);
+    return new EnglishPorterFilter(input, protectedWords);
   }

 }


-/** English Porter2 filter that doesn't use reflection to
-/*  adapt lucene to the snowball stemmer code.
+/**
+ * English Porter2 filter that doesn't use reflection to
+ * adapt lucene to the snowball stemmer code.
  */
 class EnglishPorterFilter extends TokenFilter {
-  private final Set protWords;
+  private final CharArraySet protWords;
   private net.sf.snowball.ext.EnglishStemmer stemmer;

-  public EnglishPorterFilter(TokenStream source, Set protWords) {
+  public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
     super(source);
-    this.protWords=protWords;
+    this.protWords = protWords;
     stemmer = new net.sf.snowball.ext.EnglishStemmer();
   }


-  /** the original code from lucene sandbox
-  public final Token next() throws IOException {
-    Token token = input.next();
-    if (token == null)
-      return null;
-    stemmer.setCurrent(token.termText());
-    try {
-      stemMethod.invoke(stemmer, EMPTY_ARGS);
-    } catch (Exception e) {
-      throw new RuntimeException(e.toString());
-    }
-    return new Token(stemmer.getCurrent(),
-                     token.startOffset(), token.endOffset(), token.type());
-  }
-  **/
+  /**
+   * the original code from lucene sandbox
+   * public final Token next() throws IOException {
+   *   Token token = input.next();
+   *   if (token == null)
+   *     return null;
+   *   stemmer.setCurrent(token.termText());
+   *   try {
+   *     stemMethod.invoke(stemmer, EMPTY_ARGS);
+   *   } catch (Exception e) {
+   *     throw new RuntimeException(e.toString());
+   *   }
+   *   return new Token(stemmer.getCurrent(),
+   *                    token.startOffset(), token.endOffset(), token.type());
+   * }
+   */

   @Override
-  public Token next() throws IOException {
-    Token tok = input.next();
-    if (tok==null) return null;
-    String tokstr = tok.termText();
-
-    // if protected, don't stem.  use this to avoid stemming collisions.
-    if (protWords != null && protWords.contains(tokstr)) {
-      return tok;
-    }
-
-    stemmer.setCurrent(tokstr);
-    stemmer.stem();
-    String newstr = stemmer.getCurrent();
-    if (tokstr.equals(newstr)) {
-      return tok;
-    } else {
-      // TODO: it would be nice if I could just set termText directly like
-      // lucene packages can.
-      Token newtok = new Token(newstr, tok.startOffset(), tok.endOffset(), tok.type());
-      newtok.setPositionIncrement(tok.getPositionIncrement());
-      return newtok;
-    }
+  public Token next(Token token) throws IOException {
+    Token result = input.next(token);
+    if (result != null) {
+      char[] termBuffer = result.termBuffer();
+      int len = result.termLength();
+      // if protected, don't stem.  use this to avoid stemming collisions.
+      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+        return result;
+      }
+      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
+      stemmer.stem();
+      String newstr = stemmer.getCurrent();
+      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+    }
+    return result;
   }
 }
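The switch to CharArraySet means the protected-words lookup runs directly against the token's char[] with no String allocation. A small standalone sketch, assuming only the CharArraySet(Collection, boolean) constructor and contains(char[], int, int) method that appear in the diff above:

    import java.util.Arrays;
    import org.apache.lucene.analysis.CharArraySet;

    public class CharArraySetDemo {
      public static void main(String[] args) {
        CharArraySet protect = new CharArraySet(Arrays.asList("banks", "fledgling"), false);
        char[] term = "banks".toCharArray();
        // lookup goes straight against the buffer; no String is built
        System.out.println(protect.contains(term, 0, term.length)); // true
      }
    }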
@@ -28,25 +28,26 @@ import org.apache.lucene.analysis.*;
  * This filter should be used on indexing time only.
  * Example field definition in schema.xml:
  * <pre>
- * <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
- *  <analyzer type="index">
- *   <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *   <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
- *   <filter class="solr.StopFilterFactory" ignoreCase="true"/>
- *   <filter class="solr.HyphenatedWordsFilterFactory"/>
- *   <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
- *   <filter class="solr.LowerCaseFilterFactory"/>
- *   <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- *  </analyzer>
- *  <analyzer type="query">
- *   <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *   <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- *   <filter class="solr.StopFilterFactory" ignoreCase="true"/>
- *   <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
- *   <filter class="solr.LowerCaseFilterFactory"/>
- *   <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- *  </analyzer>
- * </fieldtype>
+ * <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
+ *  <analyzer type="index">
+ *   <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *   <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ *   <filter class="solr.StopFilterFactory" ignoreCase="true"/>
+ *   <filter class="solr.HyphenatedWordsFilterFactory"/>
+ *   <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ *   <filter class="solr.LowerCaseFilterFactory"/>
+ *   <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ *  </analyzer>
+ *  <analyzer type="query">
+ *   <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *   <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ *   <filter class="solr.StopFilterFactory" ignoreCase="true"/>
+ *   <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
+ *   <filter class="solr.LowerCaseFilterFactory"/>
+ *   <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ *  </analyzer>
+ * </fieldtype>
  * </pre>
+ *
  */
 public final class HyphenatedWordsFilter extends TokenFilter {
@@ -55,16 +56,18 @@ public final class HyphenatedWordsFilter extends TokenFilter {
     super(in);
   }

-  /**
+
+
+  /**
    * @inheritDoc
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
-  public final Token next() throws IOException {
-    StringBuffer termText = new StringBuffer(25);
+  public final Token next(Token in) throws IOException {
+    StringBuilder termText = new StringBuilder(25);
     int startOffset = -1, firstPositionIncrement = -1, wordsMerged = 0;
     Token lastToken = null;
-    for (Token token = input.next(); token != null; token = input.next()) {
-      termText.append(token.termText());
+    for (Token token = input.next(in); token != null; token = input.next()) {
+      termText.append(token.termBuffer(), 0, token.termLength());
       //current token ends with hyphen -> grab the next token and glue them together
       if (termText.charAt(termText.length() - 1) == '-') {
         wordsMerged++;
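StringBuilder's append(char[], int, int) overload copies straight out of the token's buffer, skipping the intermediate String that termText() used to create. A pure-JDK illustration:

    public class AppendDemo {
      public static void main(String[] args) {
        StringBuilder sb = new StringBuilder(25);
        char[] buf = "ecologi-".toCharArray();
        sb.append(buf, 0, buf.length);  // copies directly from the char[], no String made
        System.out.println(sb);         // ecologi-
      }
    }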
@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.CharArraySet;

 import java.io.IOException;
 import java.util.Set;
@@ -32,23 +33,18 @@ import java.util.Set;
  * @since solr 1.3
  */
 public final class KeepWordFilter extends TokenFilter {
-  final Set<String> words;
-  final boolean ignoreCase;
+  final CharArraySet words;


   public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
     super(in);
-    this.words=words;
-    this.ignoreCase=ignoreCase;
+    this.words = new CharArraySet(words, ignoreCase);
   }

   @Override
-  public final Token next() throws IOException {
-    for (Token token=input.next(); token!=null; token=input.next()) {
-      String txt = ignoreCase
-              ? token.termText().toLowerCase()
-              : token.termText();
-
-      if( words.contains( txt ) ) {
+  public final Token next(Token in) throws IOException {
+    for (Token token=input.next(in); token!=null; token=input.next()) {
+      if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
         return token;
       }
     }
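Folding case handling into the set (ignoreCase=true) drops the per-token toLowerCase() call and its String copy. A sketch under the same CharArraySet API as above:

    import java.util.Arrays;
    import org.apache.lucene.analysis.CharArraySet;

    public class IgnoreCaseDemo {
      public static void main(String[] args) {
        CharArraySet words = new CharArraySet(Arrays.asList("aardvark"), true); // ignoreCase
        char[] term = "AARDVARK".toCharArray();
        // matches without allocating a lowercased copy of the term
        System.out.println(words.contains(term, 0, term.length)); // true
      }
    }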
@@ -36,8 +36,8 @@ public final class LengthFilter extends TokenFilter {
     //System.out.println("min="+min+" max="+max);
   }

-  public final Token next() throws IOException {
-    for (Token token=input.next(); token!=null; token=input.next()) {
+  public final Token next(Token in) throws IOException {
+    for (Token token=input.next(in); token!=null; token=input.next(in)) {
       final int len = token.endOffset() - token.startOffset();
       if (len<min || len>max) continue;
       return token;
@@ -27,12 +27,14 @@ import java.util.Map;
  */
 public class LengthFilterFactory extends BaseTokenFilterFactory {
   int min,max;
+  public static final String MIN_KEY = "min";
+  public static final String MAX_KEY = "max";

   @Override
   public void init(Map<String, String> args) {
     super.init(args);
-    min=Integer.parseInt(args.get("min"));
-    max=Integer.parseInt(args.get("max"));
+    min=Integer.parseInt(args.get(MIN_KEY));
+    max=Integer.parseInt(args.get(MAX_KEY));
   }
   public LengthFilter create(TokenStream input) {
     return new LengthFilter(input,min,max);
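The new MIN_KEY/MAX_KEY constants let callers configure the factory without repeating magic strings, as the new LengthFilterTest later in this commit does. A hypothetical wiring sketch (assumes it sits in org.apache.solr.analysis so the Solr classes resolve):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.TokenStream;

    public class LengthFactoryDemo {
      public static LengthFilter configure(TokenStream input) {
        LengthFilterFactory factory = new LengthFilterFactory();
        Map<String, String> args = new HashMap<String, String>();
        args.put(LengthFilterFactory.MIN_KEY, "4");   // keep terms whose offset span is 4..10
        args.put(LengthFilterFactory.MAX_KEY, "10");
        factory.init(args);
        return factory.create(input);
      }
    }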
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Token;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 import java.io.IOException;
+import java.nio.CharBuffer;

 /**
  * A TokenFilter which applies a Pattern to each token in the stream,
@@ -64,12 +65,12 @@ public final class PatternReplaceFilter extends TokenFilter {
     this.all=all;
   }

-  public final Token next() throws IOException {
-    Token t = input.next();
+  public final Token next(Token in) throws IOException {
+    Token t = input.next(in);
     if (t == null)
       return null;

-    Matcher m = p.matcher(t.termText());
+    CharSequence text = CharBuffer.wrap(t.termBuffer(), 0, t.termLength());
+    Matcher m = p.matcher(text);
     if (all) {
       t.setTermText(m.replaceAll(replacement));
     } else {
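CharBuffer.wrap gives the regex engine a zero-copy CharSequence view over the term buffer, so no String is built just to run the Pattern. A pure-JDK illustration, reusing an input from the tests later in this commit:

    import java.nio.CharBuffer;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class CharBufferDemo {
      public static void main(String[] args) {
        char[] buf = "aabfooaabfooabfoob".toCharArray();
        CharSequence text = CharBuffer.wrap(buf, 0, buf.length); // view, not a copy
        Matcher m = Pattern.compile("a*b").matcher(text);
        System.out.println(m.replaceAll("-")); // -foo-foo-foo-
      }
    }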
@@ -46,29 +46,27 @@ public class PhoneticFilter extends TokenFilter
   }

   @Override
-  public final Token next() throws IOException {
+  public final Token next(Token in) throws IOException {
     if( save != null ) {
       Token temp = save;
       save = null;
       return temp;
     }

-    Token t = input.next();
+    Token t = input.next(in);
     if( t != null ) {
-      String value = t.termText();
+      String value = new String(t.termBuffer(), 0, t.termLength());
       try {
-        value = encoder.encode(t.termText()).toString();
+        value = encoder.encode(value).toString();
       }
       catch (Exception ignored) {} // just use the direct text

-      Token m = new Token(value, t.startOffset(), t.endOffset(), name );
+      //Token m = new Token(value, t.startOffset(), t.endOffset(), name );
       if( inject ) {
-        m.setPositionIncrement(0);
-        save = m;
-      }
-      else {
-        // replace the token rather then add it too the stream
-        return m;
+        save = (Token) t.clone();
+        save.setPositionIncrement(0);
+        save.setTermBuffer(value.toCharArray(), 0, value.length());
+      } else {
+        t.setTermBuffer(value.toCharArray(), 0, value.length());
       }
     }
     return t;
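The inject path now stacks the phonetic form on the same position as the original by cloning the token and zeroing its position increment, instead of constructing a fresh Token. A hedged sketch of that pattern (the "FN" encoding is a stand-in value, not a real encoder call):

    import org.apache.lucene.analysis.Token;

    public class InjectDemo {
      public static void main(String[] args) {
        Token t = new Token("phone", 0, 5);
        String encoded = "FN";                 // stand-in for encoder.encode(...)
        Token save = (Token) t.clone();        // keeps offsets and type
        save.setPositionIncrement(0);          // stacked at the same position as the original
        save.setTermBuffer(encoded.toCharArray(), 0, encoded.length());
      }
    }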
@@ -19,6 +19,7 @@ package org.apache.solr.analysis;

 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.util.ArraysUtils;

 import java.io.IOException;
@@ -30,23 +31,27 @@ public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
   public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}
   protected Token process(Token t) throws IOException {
     Token tok = read();
-    OUT: while (tok != null && tok.getPositionIncrement()==0) {
+    while (tok != null && tok.getPositionIncrement()==0) {
       if (null != t) {
         write(t);
         t = null;
       }
       boolean dup=false;
-      IN: for (Token outTok : output()) {
-        if (outTok.termText().equals(tok.termText())) {
+      for (Token outTok : output()) {
+        int tokLen = tok.termLength();
+        if (outTok.termLength() == tokLen && ArraysUtils.equals(outTok.termBuffer(), 0, tok.termBuffer(), 0, tokLen)) {
           dup=true;
-          break IN;
+          //continue;;
         }
       }
-      if (!dup)
+      if (!dup){
         write(tok);
+      }
       tok = read();
     }
-    if (tok != null) pushBack(tok);
+    if (tok != null) {
+      pushBack(tok);
+    }
     return t;
   }
 }
@@ -17,9 +17,9 @@

 package org.apache.solr.analysis;

+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;

 import java.io.IOException;
@@ -32,47 +32,51 @@ public final class TrimFilter extends TokenFilter {

   final boolean updateOffsets;

-  public TrimFilter(TokenStream in, boolean updateOffsets ) {
+  public TrimFilter(TokenStream in, boolean updateOffsets) {
     super(in);
     this.updateOffsets = updateOffsets;
   }

   @Override
-  public final Token next() throws IOException {
-    Token t = input.next();
-    if (null == t || null == t.termText())
+  public final Token next(Token in) throws IOException {
+    Token t = input.next(in);
+    if (null == t || null == t.termBuffer() || t.termLength() == 0){
       return t;
+    }

-    if( updateOffsets ) {
-      String txt = t.termText();
-      int start = 0;
-      int end = txt.length();
-      int endOff = 0;
-
-      // eat the first characters
-      while ((start < end) && (txt.charAt(start) <= ' ')) {
-        start++;
-      }
-
-      // eat the end characters
-      while ((start < end) && (txt.charAt(end-1) <= ' ')) {
-        end--;
-        endOff++;
-      }
-
-      if( start > 0 || end < txt.length() ) {
-        int incr = t.getPositionIncrement();
-        t = new Token( t.termText().substring( start, end ),
-                t.startOffset()+start,
-                t.endOffset()-endOff,
-                t.type() );
-
-        t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset
-      }
-    }
-    else {
-      t.setTermText( t.termText().trim() );
+    char[] termBuffer = t.termBuffer();
+    int len = t.termLength();
+    int start = 0;
+    int end = 0;
+    int endOff = 0;
+
+    // eat the first characters
+    //QUESTION: Should we use Character.isWhitespace() instead?
+    for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
+    }
+    // eat the end characters
+    for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
+      endOff++;
+    }
+    if (start > 0 || end < len) {
+      if (start < end) {
+        t.setTermBuffer(t.termBuffer(), start, (end - start));
+      } else {
+        t.setTermLength(0);
+      }
+      if (updateOffsets) {
+        t.setStartOffset(t.startOffset() + start);
+        if (start < end) {
+          t.setEndOffset(t.endOffset() - endOff);
+        } //else if end is less than, start, then the term length is 0, so, no need to bother w/ the end offset
+      }
+      /*t = new Token( t.termText().substring( start, end ),
+              t.startOffset()+start,
+              t.endOffset()-endOff,
+              t.type() );*/
+
     }

     return t;
   }
 }
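The trim now happens in place on the term buffer instead of allocating txt.trim(). The same two scans in plain Java (this sketch guards the tail scan with end > start, written a little more defensively than the filter above):

    public class TrimDemo {
      public static void main(String[] args) {
        char[] buf = "  a  ".toCharArray();
        int len = buf.length, start = 0, end = len;
        while (start < end && buf[start] <= ' ') start++;      // eat leading whitespace
        while (end > start && buf[end - 1] <= ' ') end--;      // eat trailing whitespace
        System.out.println("[" + new String(buf, start, end - start) + "]"); // [a]
      }
    }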
@@ -192,7 +192,7 @@ final class WordDelimiterFilter extends TokenFilter {
   // use the type of the first char as the type
   // of the token.
   private int tokType(Token t) {
-    return charType(t.termText().charAt(0));
+    return charType(t.termBuffer()[0]);
   }

   // There isn't really an efficient queue class, so we will
@@ -207,23 +207,22 @@ final class WordDelimiterFilter extends TokenFilter {
   private Token newTok(Token orig, int start, int end) {
     int startOff = orig.startOffset();
     int endOff = orig.endOffset();
-    String origStr = orig.termText();
-
     // if length by start + end offsets doesn't match the term text then assume
     // this is a synonym and don't adjust the offsets.
-    if (origStr.length() == endOff-startOff) {
+    if (orig.termLength() == endOff-startOff) {
       endOff = startOff + end;
       startOff += start;
     }
-    return new Token(orig.termText().substring(start,end),
-                     startOff,
+    Token newTok = new Token(startOff,
                      endOff,
                      orig.type());
+    newTok.setTermBuffer(orig.termBuffer(), start, (end - start));
+    return newTok;
   }


-  public final Token next() throws IOException {
+  public final Token next(Token in) throws IOException {

     // check the queue first
     if (queuePos<queue.size()) {
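setTermBuffer(char[], int, int) copies a slice of the parent token's buffer, so subwords no longer go through substring(). A sketch using the Token constructors that appear above, assuming the 2.3-era Token keeps termBuffer() in sync with its String-based constructor:

    import org.apache.lucene.analysis.Token;

    public class SliceDemo {
      public static void main(String[] args) {
        Token orig = new Token("Wi-Fi", 0, 5);   // term text plus start/end offsets
        Token sub = new Token(orig.startOffset(), orig.startOffset() + 2, orig.type());
        sub.setTermBuffer(orig.termBuffer(), 0, 2);  // copy "Wi" straight from the parent buffer
        System.out.println(new String(sub.termBuffer(), 0, sub.termLength())); // Wi
      }
    }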
@@ -248,25 +247,25 @@ final class WordDelimiterFilter extends TokenFilter {
       Token t = input.next();
       if (t == null) return null;

-      String s = t.termText();
+      char [] termBuffer = t.termBuffer();
+      int len = t.termLength();
       int start=0;
-      int end=s.length();
-      if (end==0) continue;
+      if (len ==0) continue;

       origPosIncrement = t.getPositionIncrement();

       // Avoid calling charType more than once for each char (basically
       // avoid any backtracking).
       // makes code slightly more difficult, but faster.
-      int ch=s.charAt(start);
+      int ch=termBuffer[start];
       int type=charType(ch);

       int numWords=0;

-      while (start<end) {
+      while (start< len) {
         // first eat delimiters at the start of this subword
-        while ((type & SUBWORD_DELIM)!=0 && ++start<end) {
-          ch=s.charAt(start);
+        while ((type & SUBWORD_DELIM)!=0 && ++start< len) {
+          ch=termBuffer[start];
           type=charType(ch);
         }
@@ -278,23 +277,23 @@ final class WordDelimiterFilter extends TokenFilter {
       int lastType=type;  // type of the previously read char


-      while (pos<end) {
+      while (pos< len) {

         if (type!=lastType) {
           // check and remove "'s" from the end of a token.
           // the pattern to check for is
           //   ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
           if ((lastType & ALPHA)!=0) {
-            if (ch=='\'' && pos+1<end
-                && (s.charAt(pos+1)=='s' || s.charAt(pos+1)=='S'))
+            if (ch=='\'' && pos+1< len
+                && (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
             {
               int subWordEnd=pos;
-              if (pos+2>=end) {
+              if (pos+2>= len) {
                 // end of string detected after "'s"
                 pos+=2;
               } else {
                 // make sure that a delimiter follows "'s"
-                int ch2 = s.charAt(pos+2);
+                int ch2 = termBuffer[pos+2];
                 int type2 = charType(ch2);
                 if ((type2 & SUBWORD_DELIM)!=0) {
                   // if delimiter, move position pointer
@@ -340,7 +339,7 @@ final class WordDelimiterFilter extends TokenFilter {
         }
       }

-      if (++pos >= end) {
+      if (++pos >= len) {
         if (start==0) {
           // the subword is the whole original token, so
           // return it unchanged.
@@ -362,7 +361,7 @@ final class WordDelimiterFilter extends TokenFilter {
       }

       lastType = type;
-      ch = s.charAt(pos);
+      ch = termBuffer[pos];
       type = charType(ch);
     }
@@ -482,7 +481,7 @@ final class WordDelimiterFilter extends TokenFilter {
       tok = lst.get(i);
       if (catenateSubwords) {
         if (i==start) firstTok=tok;
-        sb.append(tok.termText());
+        sb.append(tok.termBuffer(), 0, tok.termLength());
       }
       if (generateSubwords) {
         queue.add(tok);
@@ -0,0 +1,35 @@
+package org.apache.solr.util;
+
+
+/**
+ *
+ *
+ **/
+//Since Arrays.equals doesn't implement offsets for equals
+public class ArraysUtils {
+
+  /**
+   * See if two array slices are the same.
+   *
+   * @param left        The left array to compare
+   * @param offsetLeft  The offset into the array.  Must be positive
+   * @param right       The right array to compare
+   * @param offsetRight the offset into the right array.  Must be positive
+   * @param length      The length of the section of the array to compare
+   * @return true if the two arrays, starting at their respective offsets, are equal
+   *
+   * @see java.util.Arrays#equals(char[], char[])
+   */
+  public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) {
+    if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) {
+      for (int i = 0; i < length; i++) {
+        if (left[offsetLeft + i] != right[offsetRight + i]) {
+          return false;
+        }
+
+      }
+      return true;
+    }
+    return false;
+  }
+}
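RemoveDuplicatesTokenFilter above relies on this helper to compare term buffers at arbitrary offsets without building Strings. A quick usage sketch:

    public class ArraysUtilsDemo {
      public static void main(String[] args) {
        char[] a = "foobar".toCharArray();
        char[] b = "barfoo".toCharArray();
        // compare "foo" at offset 0 of a with "foo" at offset 3 of b
        System.out.println(org.apache.solr.util.ArraysUtils.equals(a, 0, b, 3, 3)); // true
      }
    }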
@@ -0,0 +1,96 @@
+package org.apache.solr.analysis;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import net.sf.snowball.ext.EnglishStemmer;
+import org.apache.solr.common.ResourceLoader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.ArrayList;
+import java.util.Collections;
+
+public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
+
+  public void test() throws IOException {
+    EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
+    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+    StringBuilder gold = new StringBuilder();
+    for (int i = 0; i < test.length; i++) {
+      stemmer.setCurrent(test[i]);
+      stemmer.stem();
+      gold.append(stemmer.getCurrent()).append(' ');
+    }
+
+    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+
+    factory.init(args);
+    factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString().trim(), out);
+  }
+
+  public void testProtected() throws Exception {
+    EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
+    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
+    StringBuilder gold = new StringBuilder();
+    for (int i = 0; i < test.length; i++) {
+      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
+        stemmer.setCurrent(test[i]);
+        stemmer.stem();
+        gold.append(stemmer.getCurrent()).append(' ');
+      } else {
+        gold.append(test[i]).append(' ');
+      }
+    }
+
+    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put(EnglishPorterFilterFactory.PROTECTED_TOKENS, "who-cares.txt");
+    factory.init(args);
+    List<String> lines = new ArrayList<String>();
+    Collections.addAll(lines, "banks", "fledgling");
+    factory.inform(new LinesMockSolrResourceLoader(lines));
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString().trim(), out);
+  }
+
+  class LinesMockSolrResourceLoader implements ResourceLoader {
+    List<String> lines;
+
+    LinesMockSolrResourceLoader(List<String> lines) {
+      this.lines = lines;
+    }
+
+    public List<String> getLines(String resource) throws IOException {
+      return lines;
+    }
+
+    public Object newInstance(String cname, String... subpackages) {
+      return null;
+    }
+
+    public InputStream openResource(String resource) throws IOException {
+      return null;
+    }
+  }
+}
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+public class LengthFilterTest extends BaseTokenTestCase {
+
+  public void test() throws IOException {
+    LengthFilterFactory factory = new LengthFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
+    args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
+    factory.init(args);
+    String[] test = {"foo", "foobar", "super-duper-trooper"};
+    String gold = "foobar";
+    String out = tsToString(factory.create(new IterTokenStream(test)));
+    assertEquals(gold.toString(), out);
+  }
+}
@@ -27,8 +27,8 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
  */
 public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenatedWords() throws Exception {
-    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
-    String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
+    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
+    String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on and ecological";
     // first test
     TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
     ts = new HyphenatedWordsFilter(ts);
@@ -17,13 +17,13 @@

 package org.apache.solr.analysis;

-import java.io.StringReader;
-import java.util.regex.Pattern;
-import junit.framework.TestCase;
-
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;

+import java.io.StringReader;
+import java.util.regex.Pattern;
+
 /**
  * @version $Id:$
  */
|
@ -32,61 +32,81 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
|
|||
public void testReplaceAll() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", true);
|
||||
assertEquals("-foo-foo-foo-", ts.next().termText());
|
||||
assertEquals("-", ts.next().termText());
|
||||
assertEquals("c-", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", true);
|
||||
Token token = ts.next();
|
||||
assertEquals("-foo-foo-foo-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
}
|
||||
|
||||
public void testReplaceFirst() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", false);
|
||||
assertEquals("-fooaabfooabfoob", ts.next().termText());
|
||||
assertEquals("-", ts.next().termText());
|
||||
assertEquals("c-", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", false);
|
||||
Token token = ts.next();
|
||||
assertEquals("-fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
}
|
||||
|
||||
public void testStripFirst() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, false);
|
||||
assertEquals("fooaabfooabfoob", ts.next().termText());
|
||||
assertEquals("", ts.next().termText());
|
||||
assertEquals("c", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, false);
|
||||
Token token = ts.next();
|
||||
assertEquals("fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
}
|
||||
|
||||
public void testStripAll() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, true);
|
||||
assertEquals("foofoofoo", ts.next().termText());
|
||||
assertEquals("", ts.next().termText());
|
||||
assertEquals("c", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, true);
|
||||
Token token = ts.next();
|
||||
assertEquals("foofoofoo", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
}
|
||||
|
||||
public void testReplaceAllWithBackRef() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("(a*)b"),
|
||||
"$1\\$", true);
|
||||
assertEquals("aa$fooaa$fooa$foo$", ts.next().termText());
|
||||
assertEquals("a$", ts.next().termText());
|
||||
assertEquals("caaaaaaaaa$", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("(a*)b"),
|
||||
"$1\\$", true);
|
||||
Token token = ts.next();
|
||||
assertEquals("aa$fooaa$fooa$foo$", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("a$", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("caaaaaaaaa$", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@@ -81,8 +81,8 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
         new IterTokenStream(stream.iterator()), enc, "text", inject );

     for( Token t : output ) {
-      Token got = filter.next();
-      assertEquals( t.termText(), got.termText());
+      Token got = filter.next(t);
+      assertEquals( new String(t.termBuffer(), 0, t.termLength()), new String(got.termBuffer(), 0, got.termLength()));
     }
     assertNull( filter.next() );  // no more tokens
   }
@@ -35,11 +35,16 @@ public class TestTrimFilter extends BaseTokenTestCase {
             new Token("cCc",11,15),
             new Token(" ",16,20)), false );

-    assertEquals("a", ts.next().termText());
-    assertEquals("b", ts.next().termText());
-    assertEquals("cCc", ts.next().termText());
-    assertEquals("", ts.next().termText());
-    assertNull(ts.next());
+    Token token = ts.next();
+    assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
+    token = ts.next();
+    assertNull(token);

     ts = new TrimFilter( new IterTokenStream(
             new Token(" a", 0,2),
@@ -0,0 +1,48 @@
+package org.apache.solr.util;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+public class ArraysUtilsTest extends TestCase {
+
+
+  public ArraysUtilsTest(String s) {
+    super(s);
+  }
+
+  protected void setUp() {
+  }
+
+  protected void tearDown() {
+
+  }
+
+  public void test() {
+    String left = "this is equal";
+    String right = left;
+    char[] leftChars = left.toCharArray();
+    char[] rightChars = right.toCharArray();
+    assertTrue(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 0, rightChars, 0, left.length()));
+
+    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 0, left.length()));
+    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 2, left.length()));
+
+    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 25, rightChars, 0, left.length()));
+    assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 12, rightChars, 0, left.length()));
+  }
+}