SOLR-1353: Implement and use reusable token streams for analysis

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@804726 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2009-08-16 17:28:58 +00:00
parent b45440d7c7
commit bb76127225
24 changed files with 377 additions and 248 deletions

View File

@@ -320,6 +320,9 @@ Optimizations
 15. SOLR-1150: Load Documents for Highlighting one at a time rather than
     all at once to avoid OOM with many large Documents. (Siddharth Gargate via Mark Miller)

+16. SOLR-1353: Implement and use reusable token streams for analysis. (yonik)
+
 Bug Fixes
 ----------------------
  1. SOLR-774: Fixed logging level display (Sean Timm via Otis Gospodnetic)

View File

@@ -17,10 +17,8 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 import java.io.IOException;
 import java.util.ArrayList;
@@ -190,52 +188,53 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
  * This is package protected since it is not useful without the Factory
  */
 class CapitalizationFilter extends TokenFilter {
-  protected final CapitalizationFilterFactory factory;
+  private final CapitalizationFilterFactory factory;
+  private final TermAttribute termAtt;

   public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
     super(in);
     this.factory = factory;
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }

   @Override
-  public Token next(Token token) throws IOException {
-    Token t = input.next(token);
-    if (t != null) {
-      char[] termBuffer = t.termBuffer();
-      int termBufferLength = t.termLength();
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;
+
+    char[] termBuffer = termAtt.termBuffer();
+    int termBufferLength = termAtt.termLength();
     char[] backup = null;
     if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
       //make a backup in case we exceed the word count
       System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
     }

     if (termBufferLength < factory.maxTokenLength) {
       int wordCount = 0;

       int lastWordStart = 0;
       for (int i = 0; i < termBufferLength; i++) {
         char c = termBuffer[i];
         if (c <= ' ' || c == '.') {
           int len = i - lastWordStart;
           if (len > 0) {
             factory.processWord(termBuffer, lastWordStart, len, wordCount++);
             lastWordStart = i + 1;
             i++;
           }
         }
       }

       // process the last word
       if (lastWordStart < termBufferLength) {
         factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
       }

       if (wordCount > factory.maxWordCount) {
-        t.setTermBuffer(backup, 0, termBufferLength);
+        termAtt.setTermBuffer(backup, 0, termBufferLength);
       }
     }
-    }
-    return t;
+
+    return true;
   }
 }
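Note: the CapitalizationFilter hunk above is the first occurrence of a conversion this commit applies to nearly every filter: next(Token) is replaced by incrementToken(), and the term is read and mutated through a TermAttribute looked up once in the constructor. A minimal sketch of that pattern, assuming the Lucene 2.9 attribute API used above; the filter name and its behavior are invented for illustration and are not part of this commit:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

// Hypothetical filter used only to illustrate the next(Token) -> incrementToken() migration.
class UpcaseFirstCharFilter extends TokenFilter {
  private final TermAttribute termAtt;

  UpcaseFirstCharFilter(TokenStream in) {
    super(in);
    // look up the shared attribute once; it is reused for every token
    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false; // replaces "Token t = input.next(token); if (t == null) ..."
    char[] buf = termAtt.termBuffer();         // mutate the shared term buffer in place
    if (termAtt.termLength() > 0) buf[0] = Character.toUpperCase(buf[0]);
    return true;                               // replaces "return t;"
  }
}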

View File

@@ -50,7 +50,8 @@ public class DoubleMetaphoneFilter extends TokenFilter {
     for(;;) {

       if (!remainingTokens.isEmpty()) {
-        clearAttributes(); restoreState(remainingTokens.removeFirst());
+        // clearAttributes();  // not currently necessary
+        restoreState(remainingTokens.removeFirst());
         return true;
       }

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
+import org.tartarus.snowball.SnowballProgram;

 import java.io.IOException;
 import java.io.File;
@@ -75,50 +76,9 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
  * English Porter2 filter that doesn't use reflection to
  * adapt lucene to the snowball stemmer code.
  */
-class EnglishPorterFilter extends TokenFilter {
-  private final CharArraySet protWords;
-  private org.tartarus.snowball.ext.EnglishStemmer stemmer;
-
+@Deprecated
+class EnglishPorterFilter extends SnowballPorterFilter {
   public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
-    super(source);
-    this.protWords = protWords;
-    stemmer = new org.tartarus.snowball.ext.EnglishStemmer();
-  }
-
-  /**
-   * the original code from lucene sandbox
-   * public final Token next() throws IOException {
-   *   Token token = input.next();
-   *   if (token == null)
-   *     return null;
-   *   stemmer.setCurrent(token.termText());
-   *   try {
-   *     stemMethod.invoke(stemmer, EMPTY_ARGS);
-   *   } catch (Exception e) {
-   *     throw new RuntimeException(e.toString());
-   *   }
-   *   return new Token(stemmer.getCurrent(),
-   *       token.startOffset(), token.endOffset(), token.type());
-   * }
-   */
-
-  @Override
-  public Token next(Token token) throws IOException {
-    Token result = input.next(token);
-    if (result != null) {
-      char[] termBuffer = result.termBuffer();
-      int len = result.termLength();
-      // if protected, don't stem.  use this to avoid stemming collisions.
-      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
-        return result;
-      }
-      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
-      stemmer.stem();
-      String newstr = stemmer.getCurrent();
-      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
-    }
-    return result;
+    super(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords);
   }
 }

View File

@@ -21,6 +21,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.tartarus.snowball.SnowballProgram;

 import java.io.IOException;
 import java.util.Set;
@@ -33,21 +35,20 @@ import java.util.Set;
  * @since solr 1.3
  */
 public final class KeepWordFilter extends TokenFilter {
-  final CharArraySet words;
+  private final CharArraySet words;
+  private final TermAttribute termAtt;

   public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
     super(in);
     this.words = new CharArraySet(words, ignoreCase);
+    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
   }

   @Override
-  public final Token next(Token in) throws IOException {
-    for (Token token=input.next(in); token!=null; token=input.next(token)) {
-      if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
-        return token;
-      }
+  public boolean incrementToken() throws IOException {
+    while (input.incrementToken()) {
+      if (words.contains(termAtt.termBuffer(), 0, termAtt.termLength())) return true;
     }
-    return null;
+    return false;
   }
 }

View File

@@ -20,9 +20,12 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
+import java.util.Set;
 import java.io.IOException;
 import java.nio.CharBuffer;
@@ -40,10 +43,10 @@ import java.nio.CharBuffer;
  * @see Pattern
  */
 public final class PatternReplaceFilter extends TokenFilter {
-  Pattern p;
-  String replacement;
-  boolean all = true;
+  private final Pattern p;
+  private final String replacement;
+  private final boolean all;
+  private final TermAttribute termAtt;

   /**
    * Constructs an instance to replace either the first, or all occurances
    *
@@ -63,21 +66,23 @@ public final class PatternReplaceFilter extends TokenFilter {
     this.p=p;
     this.replacement = (null == replacement) ? "" : replacement;
     this.all=all;
+    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
   }

-  public final Token next(Token in) throws IOException {
-    Token t = input.next(in);
-    if (t == null)
-      return null;
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;

-    CharSequence text = CharBuffer.wrap(t.termBuffer(), 0, t.termLength());
+    CharSequence text = CharBuffer.wrap(termAtt.termBuffer(), 0, termAtt.termLength());
     Matcher m = p.matcher(text);
     if (all) {
-      t.setTermText(m.replaceAll(replacement));
+      termAtt.setTermBuffer(m.replaceAll(replacement));
     } else {
-      t.setTermText(m.replaceFirst(replacement));
+      termAtt.setTermBuffer(m.replaceFirst(replacement));
     }
-    return t;
+    return true;
   }
 }

View File

@@ -20,6 +20,10 @@ package org.apache.solr.analysis;
 import org.apache.commons.io.IOUtils;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.core.SolrConfig;
@@ -111,6 +115,31 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory
     final Iterator<Token> iter = tokens.iterator();
     return new TokenStream() {
+      @Override
+      public boolean incrementToken() throws IOException {
+        return super.incrementToken();
+      }
+
+      @Override
+      public void end() throws IOException {
+        super.end();
+      }
+
+      @Override
+      public Token next(Token reusableToken) throws IOException {
+        return super.next(reusableToken);
+      }
+
+      @Override
+      public void reset() throws IOException {
+        super.reset();
+      }
+
+      @Override
+      public void close() throws IOException {
+        super.close();
+      }
+
       @Override
       public Token next() throws IOException {
         if( iter.hasNext() ) {

View File

@@ -54,7 +54,8 @@ public class PhoneticFilter extends TokenFilter
   @Override
   public boolean incrementToken() throws IOException {
     if( save != null ) {
-      clearAttributes(); restoreState(save);
+      // clearAttributes();  // not currently necessary
+      restoreState(save);
       save = null;
       return true;
     }

View File

@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.util.StrUtils;
@@ -97,50 +98,35 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
   }
 }

 class SnowballPorterFilter extends TokenFilter {
   private final CharArraySet protWords;
-  private SnowballProgram stemmer;
+  private final SnowballProgram stemmer;
+  private final TermAttribute termAtt;

   public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
     super(source);
     this.protWords = protWords;
     this.stemmer = stemmer;
+    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
   }

-  /**
-   * the original code from lucene sandbox
-   * public final Token next() throws IOException {
-   *   Token token = input.next();
-   *   if (token == null)
-   *     return null;
-   *   stemmer.setCurrent(token.termText());
-   *   try {
-   *     stemMethod.invoke(stemmer, EMPTY_ARGS);
-   *   } catch (Exception e) {
-   *     throw new RuntimeException(e.toString());
-   *   }
-   *   return new Token(stemmer.getCurrent(),
-   *       token.startOffset(), token.endOffset(), token.type());
-   * }
-   */
-
   @Override
-  public Token next(Token token) throws IOException {
-    Token result = input.next(token);
-    if (result != null) {
-      char[] termBuffer = result.termBuffer();
-      int len = result.termLength();
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;
+
+    char[] termBuffer = termAtt.termBuffer();
+    int len = termAtt.termLength();
     // if protected, don't stem.  use this to avoid stemming collisions.
     if (protWords != null && protWords.contains(termBuffer, 0, len)) {
-      return result;
+      return true;
     }
-      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
-      stemmer.stem();
-      String newstr = stemmer.getCurrent();
-      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
-    }
-    return result;
+
+    stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
+    stemmer.stem();
+    String newstr = stemmer.getCurrent();
+    termAtt.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+
+    return true;
   }
 }

View File

@@ -17,7 +17,10 @@
 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.*;
+
+import java.io.Reader;
+import java.io.IOException;

 /**
  * @version $Id$
@@ -32,4 +35,45 @@ public abstract class SolrAnalyzer extends Analyzer {
   public int getPositionIncrementGap(String fieldName) {
     return posIncGap;
   }
+
+  /** wrap the reader in a CharStream, if appropriate */
+  public Reader charStream(Reader reader){
+    return reader;
+  }
+
+  @Override
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    return getStream(fieldName, reader).getTokenStream();
+  }
+
+  public static class TokenStreamInfo {
+    private final Tokenizer tokenizer;
+    private final TokenStream tokenStream;
+    public TokenStreamInfo(Tokenizer tokenizer, TokenStream tokenStream) {
+      this.tokenizer = tokenizer;
+      this.tokenStream = tokenStream;
+    }
+    public Tokenizer getTokenizer() { return tokenizer; }
+    public TokenStream getTokenStream() { return tokenStream; }
+  }
+
+  public abstract TokenStreamInfo getStream(String fieldName, Reader reader);
+
+  @Override
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    // if (true) return tokenStream(fieldName, reader);
+    TokenStreamInfo tsi = (TokenStreamInfo)getPreviousTokenStream();
+    if (tsi != null) {
+      tsi.getTokenizer().reset(charStream(reader));
+      // the consumer will currently call reset() on the TokenStream to hit all the filters.
+      // this isn't necessarily guaranteed by the APIs... but is currently done
+      // by lucene indexing in DocInverterPerField, and in the QueryParser
+      return tsi.getTokenStream();
+    } else {
+      tsi = getStream(fieldName, reader);
+      setPreviousTokenStream(tsi);
+      return tsi.getTokenStream();
+    }
+  }
 }
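The reusableTokenStream() implementation above caches a TokenStreamInfo per thread via getPreviousTokenStream()/setPreviousTokenStream() and, on reuse, only resets the cached Tokenizer onto the new Reader; it relies on the consumer calling reset() on the returned stream so every filter in the chain sees it, which is exactly what the handler and component changes later in this commit do. A minimal sketch of the consuming side, assuming the usual Lucene 2.9 imports; the field name and text are placeholders:

TokenStream ts = analyzer.reusableTokenStream("body", new StringReader("some text"));
ts.reset(); // propagate reset() through the whole filter chain before iterating
TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
while (ts.incrementToken()) {
  System.out.println(term.term()); // one reused attribute instance, no Token allocation per term
}
ts.end();
ts.close();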

View File

@@ -205,4 +205,9 @@ public class SynonymFilter extends TokenFilter {
     return result;
   }

+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    replacement = null;
+  }
 }
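The reset() override above matters once streams are reused: SynonymFilter buffers a pending replacement between calls, and without clearing it a recycled stream would leak state from the previous document into the next (WordDelimiterFilter gets the same treatment further down). The general shape of the fix, sketched with a hypothetical bufferedTokens queue rather than SynonymFilter's actual fields:

@Override
public void reset() throws IOException {
  input.reset();          // propagate the reset upstream to the tokenizer and earlier filters
  bufferedTokens.clear(); // drop anything queued up while consuming the previous reader
}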

View File

@@ -20,8 +20,10 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.Tokenizer;

 import java.io.Reader;
+import java.io.IOException;

 /**
  * @version $Id$
@@ -50,23 +52,14 @@ public class TokenizerChain extends SolrAnalyzer {
   public TokenizerFactory getTokenizerFactory() { return tokenizer; }
   public TokenFilterFactory[] getTokenFilterFactories() { return filters; }

-  public Reader charStream(Reader reader){
-    if( charFilters != null && charFilters.length > 0 ){
-      CharStream cs = CharReader.get( reader );
-      for (int i=0; i<charFilters.length; i++) {
-        cs = charFilters[i].create(cs);
-      }
-      reader = cs;
-    }
-    return reader;
-  }
-
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream ts = tokenizer.create(charStream(reader));
+  @Override
+  public TokenStreamInfo getStream(String fieldName, Reader reader) {
+    Tokenizer tk = (Tokenizer)tokenizer.create(charStream(reader));
+    TokenStream ts = tk;
     for (int i=0; i<filters.length; i++) {
       ts = filters[i].create(ts);
     }
-    return ts;
+    return new TokenStreamInfo(tk,ts);
   }

   public String toString() {

View File

@@ -18,6 +18,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.NumericTokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.schema.DateField;
 import static org.apache.solr.schema.TrieField.TrieTypes;
@@ -38,7 +39,6 @@ import java.io.Reader;
  * @since solr 1.4
  */
 public class TrieTokenizerFactory extends BaseTokenizerFactory {
-  protected static final DateField dateField = new DateField();
   protected final int precisionStep;
   protected final TrieTypes type;
@@ -48,28 +48,71 @@ public class TrieTokenizerFactory extends BaseTokenizerFactory {
   }

   public TokenStream create(Reader input) {
-    try {
-      StringBuilder builder = new StringBuilder();
-      char[] buf = new char[8];
-      int len;
-      while ((len = input.read(buf)) != -1)
-        builder.append(buf, 0, len);
+    return new TrieTokenizer(input, type, precisionStep, TrieTokenizer.getNumericTokenStream(precisionStep));
+  }
+}
+
+class TrieTokenizer extends Tokenizer {
+  protected static final DateField dateField = new DateField();
+  protected final int precisionStep;
+  protected final TrieTypes type;
+  protected final NumericTokenStream ts;
+
+  static NumericTokenStream getNumericTokenStream(int precisionStep) {
+    return new NumericTokenStream(precisionStep);
+  }
+
+  public TrieTokenizer(Reader input, TrieTypes type, int precisionStep, NumericTokenStream ts) {
+    // must share the attribute source with the NumericTokenStream we delegate to
+    super(ts);
+    this.type = type;
+    this.precisionStep = precisionStep;
+    this.ts = ts;
+
+    try {
+      reset(input);
+    } catch (IOException e) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
+    }
+  }
+
+  @Override
+  public void reset(Reader input) throws IOException {
+    try {
+      super.reset(input);
+      input = super.input;
+      char[] buf = new char[32];
+      int len = input.read(buf);
+      String v = new String(buf, 0, len);
       switch (type) {
         case INTEGER:
-          return new NumericTokenStream(precisionStep).setIntValue(Integer.parseInt(builder.toString()));
+          ts.setIntValue(Integer.parseInt(v));
+          break;
         case FLOAT:
-          return new NumericTokenStream(precisionStep).setFloatValue(Float.parseFloat(builder.toString()));
+          ts.setFloatValue(Float.parseFloat(v));
+          break;
         case LONG:
-          return new NumericTokenStream(precisionStep).setLongValue(Long.parseLong(builder.toString()));
+          ts.setLongValue(Long.parseLong(v));
+          break;
         case DOUBLE:
-          return new NumericTokenStream(precisionStep).setDoubleValue(Double.parseDouble(builder.toString()));
+          ts.setDoubleValue(Double.parseDouble(v));
+          break;
         case DATE:
-          return new NumericTokenStream(precisionStep).setLongValue(dateField.parseMath(null, builder.toString()).getTime());
+          ts.setLongValue(dateField.parseMath(null, v).getTime());
+          break;
         default:
           throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
       }
     } catch (IOException e) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
     }
+    ts.reset();
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    return ts.incrementToken();
+  }
 }
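TrieTokenizer above makes the tokenizer reusable by pushing all per-document work into reset(Reader), which SolrAnalyzer.reusableTokenStream() calls with each new reader. The same idea in its smallest form: a hypothetical tokenizer that emits the whole input as a single token, shown only as an illustration and not part of this commit:

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class WholeInputTokenizer extends Tokenizer {
  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  private boolean done;

  WholeInputTokenizer(Reader input) {
    super(input);
  }

  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input); // swap in the new reader
    done = false;       // clear per-document state so the same instance can be reused
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (done) return false;
    done = true;
    clearAttributes();
    char[] buf = new char[255];
    int n = input.read(buf);
    if (n <= 0) return false;
    termAtt.setTermBuffer(buf, 0, n);
    return true;
  }
}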

View File

@@ -20,6 +20,8 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

 import java.io.IOException;
@@ -31,20 +33,24 @@ import java.io.IOException;
 public final class TrimFilter extends TokenFilter {

   final boolean updateOffsets;
+  private final TermAttribute termAtt;
+  private final OffsetAttribute offsetAtt;

   public TrimFilter(TokenStream in, boolean updateOffsets) {
     super(in);
     this.updateOffsets = updateOffsets;
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
   }

   @Override
-  public final Token next(Token in) throws IOException {
-    Token t = input.next(in);
-    if (null == t || null == t.termBuffer() || t.termLength() == 0){
-      return t;
-    }
-    char[] termBuffer = t.termBuffer();
-    int len = t.termLength();
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;
+
+    char[] termBuffer = termAtt.termBuffer();
+    int len = termAtt.termLength();
     int start = 0;
     int end = 0;
     int endOff = 0;
@@ -59,24 +65,17 @@ public final class TrimFilter extends TokenFilter {
     }

     if (start > 0 || end < len) {
       if (start < end) {
-        t.setTermBuffer(t.termBuffer(), start, (end - start));
+        termAtt.setTermBuffer(termBuffer, start, (end - start));
       } else {
-        t.setTermLength(0);
+        termAtt.setTermLength(0);
       }
       if (updateOffsets) {
-        t.setStartOffset(t.startOffset() + start);
-        if (start < end) {
-          t.setEndOffset(t.endOffset() - endOff);
-        } //else if end is less than, start, then the term length is 0, so, no need to bother w/ the end offset
+        int newStart = offsetAtt.startOffset()+start;
+        int newEnd = offsetAtt.endOffset() - (start<end ? endOff:0);
+        offsetAtt.setOffset(newStart, newEnd);
       }
-      /*t = new Token( t.termText().substring( start, end ),
-             t.startOffset()+start,
-             t.endOffset()-endOff,
-             t.type() );*/
     }

-    return t;
+    return true;
   }
 }

View File

@@ -657,6 +657,12 @@ final class WordDelimiterFilter extends TokenFilter {
     }
   }

+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    queuePos=0;
+    queue.clear();
+  }

   // questions:
   //   negative numbers?  -42 indexed as just 42?

View File

@@ -24,6 +24,7 @@ import org.apache.solr.analysis.TokenFilterFactory;
 import org.apache.solr.analysis.TokenizerChain;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrQueryResponse;
 import org.apache.solr.schema.FieldType;
@@ -68,7 +69,14 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
     Analyzer analyzer = context.getAnalyzer();

     if (!TokenizerChain.class.isInstance(analyzer)) {
-      TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), new StringReader(value));
+      TokenStream tokenStream = null;
+      try {
+        tokenStream = analyzer.reusableTokenStream(context.getFieldName(), new StringReader(value));
+        tokenStream.reset();
+      } catch (IOException e) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
+      }
       NamedList<List<NamedList>> namedList = new SimpleOrderedMap<List<NamedList>>();
       namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
       return namedList;

View File

@@ -294,7 +294,9 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore
       return query;
     }
     StringBuilder norm = new StringBuilder();
-    TokenStream tokens = analyzer.tokenStream( null, new StringReader( query ) );
+    TokenStream tokens = analyzer.reusableTokenStream( "", new StringReader( query ) );
+    tokens.reset();
+
     Token token = tokens.next();
     while( token != null ) {
       norm.append( new String(token.termBuffer(), 0, token.termLength()) );

View File

@@ -160,7 +160,8 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
   private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
     Collection<Token> result = new ArrayList<Token>();
     Token token = null;
-    TokenStream ts = analyzer.tokenStream("", new StringReader(q));
+    TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q));
+    ts.reset();
     while ((token = ts.next()) != null){
       result.add(token);
     }

View File

@@ -286,7 +286,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter
       }
       catch (IllegalArgumentException e) {
         // fall back to anaylzer
-        tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[j])), 10);
+        TokenStream ts = schema.getAnalyzer().reusableTokenStream(fieldName, new StringReader(docTexts[j]));
+        ts.reset();
+        tstream = new TokenOrderingFilter(ts, 10);
       }

       Highlighter highlighter;

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Fieldable;
 import org.apache.solr.request.XMLWriter;
 import org.apache.solr.request.TextResponseWriter;
@@ -48,28 +49,43 @@ public class BoolField extends FieldType {
   }

   // avoid instantiating every time...
-  protected final static Token TRUE_TOKEN = new Token("T",0,1);
-  protected final static Token FALSE_TOKEN = new Token("F",0,1);
+  protected final static char[] TRUE_TOKEN = {'T'};
+  protected final static char[] FALSE_TOKEN = {'F'};

   ////////////////////////////////////////////////////////////////////////
   // TODO: look into creating my own queryParser that can more efficiently
   // handle single valued non-text fields (int,bool,etc) if needed.

   protected final static Analyzer boolAnalyzer = new SolrAnalyzer() {
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new Tokenizer(reader) {
-        boolean done=false;
-        public Token next() throws IOException {
-          if (done) return null;
-          done=true;
-          int ch = input.read();
-          if (ch==-1) return null;
-          return (ch=='t' || ch=='T' || ch=='1') ? TRUE_TOKEN : FALSE_TOKEN;
-        }
-      };
-    }
-  };
+    public TokenStreamInfo getStream(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new Tokenizer(reader) {
+        final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+        boolean done = false;
+
+        @Override
+        public void reset(Reader input) throws IOException {
+          done = false;
+          super.reset(input);
+        }
+
+        @Override
+        public boolean incrementToken() throws IOException {
+          clearAttributes();
+          if (done) return false;
+          done = true;
+          int ch = input.read();
+          if (ch==-1) return false;
+          termAtt.setTermBuffer(
+              ((ch=='t' || ch=='T' || ch=='1') ? TRUE_TOKEN : FALSE_TOKEN)
+              ,0,1);
+          return true;
+        }
+      };
+
+      return new TokenStreamInfo(tokenizer, tokenizer);
+    }
+  };

   public Analyzer getAnalyzer() {
     return boolAnalyzer;

View File

@@ -23,6 +23,8 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermRangeQuery;
@@ -286,55 +288,38 @@ public abstract class FieldType extends FieldProperties {
     return toInternal(val);
   }

-  /*********
-  // default analyzer for non-text fields.
-  // Only reads 80 bytes, but that should be plenty for a single value.
-  public Analyzer getAnalyzer() {
-    if (analyzer != null) return analyzer;
-
-    // the default analyzer...
-    return new Analyzer() {
-      public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new Tokenizer(reader) {
-          final char[] cbuf = new char[80];
-          public Token next() throws IOException {
-            int n = input.read(cbuf,0,80);
-            if (n<=0) return null;
-            String s = toInternal(new String(cbuf,0,n));
-            return new Token(s,0,n);
-          };
-        };
-      }
-    };
-  }
-  **********/
-
   /**
    * Default analyzer for types that only produce 1 verbatim token...
    * A maximum size of chars to be read must be specified
    */
-  protected final class DefaultAnalyzer extends SolrAnalyzer {
+  protected class DefaultAnalyzer extends SolrAnalyzer {
     final int maxChars;

     DefaultAnalyzer(int maxChars) {
       this.maxChars=maxChars;
     }

-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new Tokenizer(reader) {
-        char[] cbuf = new char[maxChars];
-        public Token next() throws IOException {
+    public TokenStreamInfo getStream(String fieldName, Reader reader) {
+      Tokenizer ts = new Tokenizer(reader) {
+        final char[] cbuf = new char[maxChars];
+        final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+        final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+        @Override
+        public boolean incrementToken() throws IOException {
+          clearAttributes();
           int n = input.read(cbuf,0,maxChars);
-          if (n<=0) return null;
-          String s = toInternal(new String(cbuf,0,n)); // virtual func on parent
-          return new Token(s,0,n);
-        };
+          if (n<=0) return false;
+          String s = toInternal(new String(cbuf,0,n));
+          termAtt.setTermBuffer(s);
+          offsetAtt.setOffset(0,n);
+          return true;
+        }
       };
+
+      return new TokenStreamInfo(ts, ts);
     }
   }

   /**
    * Analyzer set by schema for text types to use when indexing fields
    * of this type, subclasses can set analyzer themselves or override

View File

@@ -359,6 +359,11 @@ public final class IndexSchema {
       return getAnalyzer(fieldName).tokenStream(fieldName,reader);
     }

+    @Override
+    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+      return getAnalyzer(fieldName).reusableTokenStream(fieldName,reader);
+    }
+
     @Override
     public int getPositionIncrementGap(String fieldName) {
       return getAnalyzer(fieldName).getPositionIncrementGap(fieldName);

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.*;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.TextField;
@@ -65,7 +66,13 @@ public class FieldQParserPlugin extends QParserPlugin {
     // Use the analyzer to get all the tokens, and then build a TermQuery,
     // PhraseQuery, or nothing based on the term count

-    TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
+    TokenStream source = null;
+    try {
+      source = analyzer.reusableTokenStream(field, new StringReader(queryText));
+      source.reset();
+    } catch (IOException e) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
+    }
     ArrayList<Token> lst = new ArrayList<Token>();
     Token t;
     int positionCount = 0;

View File

@@ -22,8 +22,10 @@ import org.apache.lucene.document.Field;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.util.StrUtils;

 import java.io.IOException;
+import java.util.Arrays;

 /** Bypass the normal Solr pipeline and just text indexing performance
  * starting at the update handler.  The same document is indexed repeatedly.
@@ -39,6 +41,12 @@ public class TestIndexingPerformance extends AbstractSolrTestCase {
     int iter=1000;
     String iterS = System.getProperty("iter");
     if (iterS != null) iter=Integer.parseInt(iterS);
+    boolean includeDoc = Boolean.parseBoolean(System.getProperty("includeDoc","true")); // include the time to create the document
+    String doc = System.getProperty("doc");
+    if (doc != null) {
+      StrUtils.splitSmart(doc,",",true);
+    }

     SolrQueryRequest req = lrf.makeRequest();
     IndexSchema schema = req.getSchema();
@@ -53,23 +61,43 @@ public class TestIndexingPerformance extends AbstractSolrTestCase {
        ,"text","just how fast is this text indexing?"
     };

-    Document ldoc = new Document();
-    for (int i=0; i<fields.length; i+=2) {
-      String field = fields[i];
-      String val = fields[i+1];
-      Field f = schema.getField(field).createField(val, 1.0f);
-      ldoc.add(f);
-    }
+    /***
+    String[] fields = {
+      "a_i","1"
+      ,"b_i","2"
+      ,"c_i","3"
+      ,"d_i","4"
+      ,"e_i","5"
+      ,"f_i","6"
+      ,"g_i","7"
+      ,"h_i","8"
+      ,"i_i","9"
+      ,"j_i","0"
+      ,"k_i","0"
+    };
+    ***/

+    long start = System.currentTimeMillis();
+
     AddUpdateCommand add = new AddUpdateCommand();
     add.allowDups = true;
-    add.doc = ldoc;

-    long start = System.currentTimeMillis();
     for (int i=0; i<iter; i++) {
+      if (includeDoc || add.doc==null) {
+        add.doc = new Document();
+        for (int j=0; j<fields.length; j+=2) {
+          String field = fields[j];
+          String val = fields[j+1];
+          Field f = schema.getField(field).createField(val, 1.0f);
+          add.doc.add(f);
+        }
+      }
       updateHandler.addDoc(add);
     }
     long end = System.currentTimeMillis();
+    System.out.println("includeDoc="+includeDoc+" doc="+ Arrays.toString(fields));
     System.out.println("iter="+iter +" time=" + (end-start) + " throughput=" + ((long)iter*1000)/(end-start));

     //discard all the changes