mirror of https://github.com/apache/lucene.git
SOLR-1353: Implement and use reusable token streams for analysis
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@804726 13f79535-47bb-0310-9956-ffa450edef68
commit bb76127225
parent b45440d7c7
@@ -320,6 +320,9 @@ Optimizations

 15. SOLR-1150: Load Documents for Highlighting one at a time rather than
     all at once to avoid OOM with many large Documents. (Siddharth Gargate via Mark Miller)
+
+16. SOLR-1353: Implement and use reusable token streams for analysis. (yonik)
+
 Bug Fixes
 ----------------------

 1. SOLR-774: Fixed logging level display (Sean Timm via Otis Gospodnetic)
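Note: the change running through this commit is the move from the deprecated Token-based next(Token) API to the attribute-based incrementToken() API, which is what lets the analyzers below hand out reusable streams. As a rough, self-contained sketch of the consumer-side pattern (the analyzer, field name, and input text are placeholders, not taken from this commit):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ReusableStreamExample {
      public static void main(String[] args) throws IOException {
        WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(); // stand-in analyzer
        // Obtain a reusable stream; the analyzer caches it and resets it on later calls.
        TokenStream ts = analyzer.reusableTokenStream("field", new StringReader("reusable token streams"));
        TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
        ts.reset(); // consumers must reset before the first incrementToken()
        while (ts.incrementToken()) { // false once the stream is exhausted
          System.out.println(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
        }
        ts.end();
        ts.close();
      }
    }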
@@ -17,10 +17,8 @@

 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 import java.io.IOException;
 import java.util.ArrayList;
@@ -190,52 +188,53 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
  * This is package protected since it is not useful without the Factory
  */
 class CapitalizationFilter extends TokenFilter {
-  protected final CapitalizationFilterFactory factory;
+  private final CapitalizationFilterFactory factory;
+  private final TermAttribute termAtt;

   public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
     super(in);
     this.factory = factory;
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }

   @Override
-  public Token next(Token token) throws IOException {
-    Token t = input.next(token);
-    if (t != null) {
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;

-    char[] termBuffer = t.termBuffer();
-    int termBufferLength = t.termLength();
+    char[] termBuffer = termAtt.termBuffer();
+    int termBufferLength = termAtt.termLength();
     char[] backup = null;
     if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
       //make a backup in case we exceed the word count
       System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
     }
     if (termBufferLength < factory.maxTokenLength) {
       int wordCount = 0;

       int lastWordStart = 0;
       for (int i = 0; i < termBufferLength; i++) {
         char c = termBuffer[i];
         if (c <= ' ' || c == '.') {
           int len = i - lastWordStart;
           if (len > 0) {
             factory.processWord(termBuffer, lastWordStart, len, wordCount++);
             lastWordStart = i + 1;
             i++;
           }
         }
       }

       // process the last word
       if (lastWordStart < termBufferLength) {
         factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
       }

       if (wordCount > factory.maxWordCount) {
-        t.setTermBuffer(backup, 0, termBufferLength);
+        termAtt.setTermBuffer(backup, 0, termBufferLength);
       }
     }
-    }
-    return t;
+
+    return true;
   }
 }
@@ -50,7 +50,8 @@ public class DoubleMetaphoneFilter extends TokenFilter {
     for(;;) {

       if (!remainingTokens.isEmpty()) {
-        clearAttributes(); restoreState(remainingTokens.removeFirst());
+        // clearAttributes(); // not currently necessary
+        restoreState(remainingTokens.removeFirst());
         return true;
       }

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
+import org.tartarus.snowball.SnowballProgram;

 import java.io.IOException;
 import java.io.File;
@@ -75,50 +76,9 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
  * English Porter2 filter that doesn't use reflection to
  * adapt lucene to the snowball stemmer code.
  */
-class EnglishPorterFilter extends TokenFilter {
-  private final CharArraySet protWords;
-  private org.tartarus.snowball.ext.EnglishStemmer stemmer;
-
+@Deprecated
+class EnglishPorterFilter extends SnowballPorterFilter {
   public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
-    super(source);
-    this.protWords = protWords;
-    stemmer = new org.tartarus.snowball.ext.EnglishStemmer();
+    super(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords);
   }
-
-
-  /**
-   * the original code from lucene sandbox
-   * public final Token next() throws IOException {
-   *   Token token = input.next();
-   *   if (token == null)
-   *     return null;
-   *   stemmer.setCurrent(token.termText());
-   *   try {
-   *     stemMethod.invoke(stemmer, EMPTY_ARGS);
-   *   } catch (Exception e) {
-   *     throw new RuntimeException(e.toString());
-   *   }
-   *   return new Token(stemmer.getCurrent(),
-   *                    token.startOffset(), token.endOffset(), token.type());
-   * }
-   */
-
-  @Override
-  public Token next(Token token) throws IOException {
-    Token result = input.next(token);
-    if (result != null) {
-      char[] termBuffer = result.termBuffer();
-      int len = result.termLength();
-      // if protected, don't stem. use this to avoid stemming collisions.
-      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
-        return result;
-      }
-      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
-      stemmer.stem();
-      String newstr = stemmer.getCurrent();
-      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
-    }
-    return result;
-  }
 }
@@ -21,6 +21,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.tartarus.snowball.SnowballProgram;

 import java.io.IOException;
 import java.util.Set;
@@ -33,21 +35,20 @@ import java.util.Set;
  * @since solr 1.3
  */
 public final class KeepWordFilter extends TokenFilter {
-  final CharArraySet words;
+  private final CharArraySet words;
+  private final TermAttribute termAtt;

   public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
     super(in);
     this.words = new CharArraySet(words, ignoreCase);
+    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
   }

   @Override
-  public final Token next(Token in) throws IOException {
-    for (Token token=input.next(in); token!=null; token=input.next(token)) {
-      if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
-        return token;
-      }
+  public boolean incrementToken() throws IOException {
+    while (input.incrementToken()) {
+      if (words.contains(termAtt.termBuffer(), 0, termAtt.termLength())) return true;
     }
-    return null;
+    return false;
   }
 }
@@ -20,9 +20,12 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
+import java.util.Set;
 import java.io.IOException;
 import java.nio.CharBuffer;
@@ -40,10 +43,10 @@ import java.nio.CharBuffer;
  * @see Pattern
  */
 public final class PatternReplaceFilter extends TokenFilter {
-  Pattern p;
-  String replacement;
-  boolean all = true;
+  private final Pattern p;
+  private final String replacement;
+  private final boolean all;
+  private final TermAttribute termAtt;
   /**
    * Constructs an instance to replace either the first, or all occurances
    *
@@ -63,21 +66,23 @@ public final class PatternReplaceFilter extends TokenFilter {
     this.p=p;
     this.replacement = (null == replacement) ? "" : replacement;
     this.all=all;
+    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
   }

-  public final Token next(Token in) throws IOException {
-    Token t = input.next(in);
-    if (t == null)
-      return null;
-    CharSequence text = CharBuffer.wrap(t.termBuffer(), 0, t.termLength());
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;
+
+    CharSequence text = CharBuffer.wrap(termAtt.termBuffer(), 0, termAtt.termLength());
     Matcher m = p.matcher(text);

     if (all) {
-      t.setTermText(m.replaceAll(replacement));
+      termAtt.setTermBuffer(m.replaceAll(replacement));
     } else {
-      t.setTermText(m.replaceFirst(replacement));
+      termAtt.setTermBuffer(m.replaceFirst(replacement));
     }

-    return t;
+    return true;
   }

 }
@@ -20,6 +20,10 @@ package org.apache.solr.analysis;
 import org.apache.commons.io.IOUtils;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.core.SolrConfig;
@@ -111,6 +115,31 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory

     final Iterator<Token> iter = tokens.iterator();
     return new TokenStream() {
+      @Override
+      public boolean incrementToken() throws IOException {
+        return super.incrementToken();
+      }
+
+      @Override
+      public void end() throws IOException {
+        super.end();
+      }
+
+      @Override
+      public Token next(Token reusableToken) throws IOException {
+        return super.next(reusableToken);
+      }
+
+      @Override
+      public void reset() throws IOException {
+        super.reset();
+      }
+
+      @Override
+      public void close() throws IOException {
+        super.close();
+      }
+
       @Override
       public Token next() throws IOException {
         if( iter.hasNext() ) {
@@ -54,7 +54,8 @@ public class PhoneticFilter extends TokenFilter
   @Override
   public boolean incrementToken() throws IOException {
     if( save != null ) {
-      clearAttributes(); restoreState(save);
+      // clearAttributes(); // not currently necessary
+      restoreState(save);
       save = null;
       return true;
     }
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.util.StrUtils;
@@ -97,50 +98,35 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
     }
   }


 class SnowballPorterFilter extends TokenFilter {
   private final CharArraySet protWords;
-  private SnowballProgram stemmer;
+  private final SnowballProgram stemmer;
+  private final TermAttribute termAtt;

   public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
     super(source);
     this.protWords = protWords;
     this.stemmer = stemmer;
+    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
   }

-  /**
-   * the original code from lucene sandbox
-   * public final Token next() throws IOException {
-   *   Token token = input.next();
-   *   if (token == null)
-   *     return null;
-   *   stemmer.setCurrent(token.termText());
-   *   try {
-   *     stemMethod.invoke(stemmer, EMPTY_ARGS);
-   *   } catch (Exception e) {
-   *     throw new RuntimeException(e.toString());
-   *   }
-   *   return new Token(stemmer.getCurrent(),
-   *                    token.startOffset(), token.endOffset(), token.type());
-   * }
-   */
-
   @Override
-  public Token next(Token token) throws IOException {
-    Token result = input.next(token);
-    if (result != null) {
-      char[] termBuffer = result.termBuffer();
-      int len = result.termLength();
-      // if protected, don't stem. use this to avoid stemming collisions.
-      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
-        return result;
-      }
-      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
-      stemmer.stem();
-      String newstr = stemmer.getCurrent();
-      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
-    }
-    return result;
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;
+
+    char[] termBuffer = termAtt.termBuffer();
+    int len = termAtt.termLength();
+    // if protected, don't stem. use this to avoid stemming collisions.
+    if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+      return true;
+    }
+
+    stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
+    stemmer.stem();
+    String newstr = stemmer.getCurrent();
+    termAtt.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+
+    return true;
   }
 }
@@ -17,7 +17,10 @@

 package org.apache.solr.analysis;

-import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.*;
+
+import java.io.Reader;
+import java.io.IOException;

 /**
  * @version $Id$
@@ -32,4 +35,45 @@ public abstract class SolrAnalyzer extends Analyzer {
   public int getPositionIncrementGap(String fieldName) {
     return posIncGap;
   }
+
+  /** wrap the reader in a CharStream, if appropriate */
+  public Reader charStream(Reader reader){
+    return reader;
+  }
+
+  @Override
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    return getStream(fieldName, reader).getTokenStream();
+  }
+
+  public static class TokenStreamInfo {
+    private final Tokenizer tokenizer;
+    private final TokenStream tokenStream;
+    public TokenStreamInfo(Tokenizer tokenizer, TokenStream tokenStream) {
+      this.tokenizer = tokenizer;
+      this.tokenStream = tokenStream;
+    }
+    public Tokenizer getTokenizer() { return tokenizer; }
+    public TokenStream getTokenStream() { return tokenStream; }
+  }
+
+
+  public abstract TokenStreamInfo getStream(String fieldName, Reader reader);
+
+  @Override
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    // if (true) return tokenStream(fieldName, reader);
+    TokenStreamInfo tsi = (TokenStreamInfo)getPreviousTokenStream();
+    if (tsi != null) {
+      tsi.getTokenizer().reset(charStream(reader));
+      // the consumer will currently call reset() on the TokenStream to hit all the filters.
+      // this isn't necessarily guaranteed by the APIs... but is currently done
+      // by lucene indexing in DocInverterPerField, and in the QueryParser
+      return tsi.getTokenStream();
+    } else {
+      tsi = getStream(fieldName, reader);
+      setPreviousTokenStream(tsi);
+      return tsi.getTokenStream();
+    }
+  }
 }
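For illustration only: a hypothetical SolrAnalyzer subclass sketching how the new getStream()/TokenStreamInfo contract added above is meant to be implemented. The analyzer name and filter choice are made up here; TokenizerChain further down is the real in-tree implementation.

    import java.io.Reader;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.solr.analysis.SolrAnalyzer;

    // Hypothetical subclass: wires a tokenizer plus one filter into a TokenStreamInfo
    // so that reusableTokenStream() above can cache the pair and later reset the tokenizer.
    public class SimpleWhitespaceAnalyzer extends SolrAnalyzer {
      @Override
      public TokenStreamInfo getStream(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(charStream(reader));
        TokenStream filtered = new LowerCaseFilter(tokenizer);
        return new TokenStreamInfo(tokenizer, filtered);
      }
    }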
@@ -205,4 +205,9 @@ public class SynonymFilter extends TokenFilter {
     return result;
   }

+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    replacement = null;
+  }
 }
@@ -20,8 +20,10 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.Tokenizer;

 import java.io.Reader;
+import java.io.IOException;

 /**
  * @version $Id$
@@ -50,23 +52,14 @@ public class TokenizerChain extends SolrAnalyzer {
   public TokenizerFactory getTokenizerFactory() { return tokenizer; }
   public TokenFilterFactory[] getTokenFilterFactories() { return filters; }

-  public Reader charStream(Reader reader){
-    if( charFilters != null && charFilters.length > 0 ){
-      CharStream cs = CharReader.get( reader );
-      for (int i=0; i<charFilters.length; i++) {
-        cs = charFilters[i].create(cs);
-      }
-      reader = cs;
-    }
-    return reader;
-  }
-
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream ts = tokenizer.create(charStream(reader));
+  @Override
+  public TokenStreamInfo getStream(String fieldName, Reader reader) {
+    Tokenizer tk = (Tokenizer)tokenizer.create(charStream(reader));
+    TokenStream ts = tk;
     for (int i=0; i<filters.length; i++) {
       ts = filters[i].create(ts);
     }
-    return ts;
+    return new TokenStreamInfo(tk,ts);
   }

   public String toString() {
@@ -18,6 +18,7 @@ package org.apache.solr.analysis;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.NumericTokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.schema.DateField;
 import static org.apache.solr.schema.TrieField.TrieTypes;
@@ -38,7 +39,6 @@ import java.io.Reader;
  * @since solr 1.4
  */
 public class TrieTokenizerFactory extends BaseTokenizerFactory {
-  protected static final DateField dateField = new DateField();
   protected final int precisionStep;
   protected final TrieTypes type;

@@ -48,28 +48,71 @@ public class TrieTokenizerFactory extends BaseTokenizerFactory {
   }

   public TokenStream create(Reader input) {
-    try {
-      StringBuilder builder = new StringBuilder();
-      char[] buf = new char[8];
-      int len;
-      while ((len = input.read(buf)) != -1)
-        builder.append(buf, 0, len);
+    return new TrieTokenizer(input, type, precisionStep, TrieTokenizer.getNumericTokenStream(precisionStep));
+  }
+}
+
+class TrieTokenizer extends Tokenizer {
+  protected static final DateField dateField = new DateField();
+  protected final int precisionStep;
+  protected final TrieTypes type;
+  protected final NumericTokenStream ts;
+
+  static NumericTokenStream getNumericTokenStream(int precisionStep) {
+    return new NumericTokenStream(precisionStep);
+  }
+
+  public TrieTokenizer(Reader input, TrieTypes type, int precisionStep, NumericTokenStream ts) {
+    // must share the attribute source with the NumericTokenStream we delegate to
+    super(ts);
+    this.type = type;
+    this.precisionStep = precisionStep;
+    this.ts = ts;
+
+    try {
+      reset(input);
+    } catch (IOException e) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
+    }
+  }
+
+  @Override
+  public void reset(Reader input) throws IOException {
+    try {
+      super.reset(input);
+      input = super.input;
+      char[] buf = new char[32];
+      int len = input.read(buf);
+      String v = new String(buf, 0, len);
       switch (type) {
         case INTEGER:
-          return new NumericTokenStream(precisionStep).setIntValue(Integer.parseInt(builder.toString()));
+          ts.setIntValue(Integer.parseInt(v));
+          break;
         case FLOAT:
-          return new NumericTokenStream(precisionStep).setFloatValue(Float.parseFloat(builder.toString()));
+          ts.setFloatValue(Float.parseFloat(v));
+          break;
         case LONG:
-          return new NumericTokenStream(precisionStep).setLongValue(Long.parseLong(builder.toString()));
+          ts.setLongValue(Long.parseLong(v));
+          break;
         case DOUBLE:
-          return new NumericTokenStream(precisionStep).setDoubleValue(Double.parseDouble(builder.toString()));
+          ts.setDoubleValue(Double.parseDouble(v));
+          break;
         case DATE:
-          return new NumericTokenStream(precisionStep).setLongValue(dateField.parseMath(null, builder.toString()).getTime());
+          ts.setLongValue(dateField.parseMath(null, v).getTime());
+          break;
         default:
           throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
       }
     } catch (IOException e) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
     }
-  }
+
+    ts.reset();
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    return ts.incrementToken();
+  }
 }
@@ -20,6 +20,8 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

 import java.io.IOException;

@@ -31,20 +33,24 @@ import java.io.IOException;
 public final class TrimFilter extends TokenFilter {

   final boolean updateOffsets;
+  private final TermAttribute termAtt;
+  private final OffsetAttribute offsetAtt;

   public TrimFilter(TokenStream in, boolean updateOffsets) {
     super(in);
     this.updateOffsets = updateOffsets;
+
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
   }

   @Override
-  public final Token next(Token in) throws IOException {
-    Token t = input.next(in);
-    if (null == t || null == t.termBuffer() || t.termLength() == 0){
-      return t;
-    }
-    char[] termBuffer = t.termBuffer();
-    int len = t.termLength();
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;
+
+    char[] termBuffer = termAtt.termBuffer();
+    int len = termAtt.termLength();
     int start = 0;
     int end = 0;
     int endOff = 0;
@@ -59,24 +65,17 @@ public final class TrimFilter extends TokenFilter {
     }
     if (start > 0 || end < len) {
       if (start < end) {
-        t.setTermBuffer(t.termBuffer(), start, (end - start));
+        termAtt.setTermBuffer(termBuffer, start, (end - start));
       } else {
-        t.setTermLength(0);
+        termAtt.setTermLength(0);
       }
       if (updateOffsets) {
-        t.setStartOffset(t.startOffset() + start);
-        if (start < end) {
-          t.setEndOffset(t.endOffset() - endOff);
-        } //else if end is less than, start, then the term length is 0, so, no need to bother w/ the end offset
+        int newStart = offsetAtt.startOffset()+start;
+        int newEnd = offsetAtt.endOffset() - (start<end ? endOff:0);
+        offsetAtt.setOffset(newStart, newEnd);
       }
-      /*t = new Token( t.termText().substring( start, end ),
-              t.startOffset()+start,
-              t.endOffset()-endOff,
-              t.type() );*/

     }

-    return t;
+    return true;
   }
 }
@@ -657,6 +657,12 @@ final class WordDelimiterFilter extends TokenFilter {
     }
   }

+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    queuePos=0;
+    queue.clear();
+  }
+
   // questions:
   // negative numbers? -42 indexed as just 42?
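The reset() overrides added to SynonymFilter and WordDelimiterFilter matter because a reused filter chain keeps the same instances alive across documents, so any filter that buffers tokens must drop that state when the chain is rewound. A minimal hypothetical sketch of the pattern (not code from this commit):

    import java.io.IOException;
    import java.util.LinkedList;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.AttributeSource;

    // Hypothetical buffering filter: emits every token twice, illustrating why
    // reset() must clear internal state when the same instance is reused.
    class BufferingFilter extends TokenFilter {
      private final LinkedList<AttributeSource.State> pending = new LinkedList<AttributeSource.State>();

      BufferingFilter(TokenStream in) {
        super(in);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!pending.isEmpty()) {
          restoreState(pending.removeFirst()); // emit a token buffered on a previous call
          return true;
        }
        if (!input.incrementToken()) return false;
        pending.addLast(captureState());       // buffer a copy so the token is emitted twice
        return true;
      }

      @Override
      public void reset() throws IOException {
        input.reset();   // rewind the upstream tokenizer/filters
        pending.clear(); // drop tokens buffered from the previous document
      }
    }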
@@ -24,6 +24,7 @@ import org.apache.solr.analysis.TokenFilterFactory;
 import org.apache.solr.analysis.TokenizerChain;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrQueryResponse;
 import org.apache.solr.schema.FieldType;
@@ -68,7 +69,14 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
     Analyzer analyzer = context.getAnalyzer();

     if (!TokenizerChain.class.isInstance(analyzer)) {
-      TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), new StringReader(value));
+
+      TokenStream tokenStream = null;
+      try {
+        tokenStream = analyzer.reusableTokenStream(context.getFieldName(), new StringReader(value));
+        tokenStream.reset();
+      } catch (IOException e) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
+      }
       NamedList<List<NamedList>> namedList = new SimpleOrderedMap<List<NamedList>>();
       namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
       return namedList;
@@ -294,7 +294,9 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore
       return query;
     }
     StringBuilder norm = new StringBuilder();
-    TokenStream tokens = analyzer.tokenStream( null, new StringReader( query ) );
+    TokenStream tokens = analyzer.reusableTokenStream( "", new StringReader( query ) );
+    tokens.reset();

     Token token = tokens.next();
     while( token != null ) {
       norm.append( new String(token.termBuffer(), 0, token.termLength()) );
@@ -160,7 +160,8 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
   private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
     Collection<Token> result = new ArrayList<Token>();
     Token token = null;
-    TokenStream ts = analyzer.tokenStream("", new StringReader(q));
+    TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q));
+    ts.reset();
     while ((token = ts.next()) != null){
       result.add(token);
     }
@@ -286,7 +286,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter
       }
       catch (IllegalArgumentException e) {
         // fall back to anaylzer
-        tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[j])), 10);
+        TokenStream ts = schema.getAnalyzer().reusableTokenStream(fieldName, new StringReader(docTexts[j]));
+        ts.reset();
+        tstream = new TokenOrderingFilter(ts, 10);
       }

       Highlighter highlighter;
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Fieldable;
 import org.apache.solr.request.XMLWriter;
 import org.apache.solr.request.TextResponseWriter;
@@ -48,28 +49,43 @@ public class BoolField extends FieldType {
   }

   // avoid instantiating every time...
-  protected final static Token TRUE_TOKEN = new Token("T",0,1);
-  protected final static Token FALSE_TOKEN = new Token("F",0,1);
+  protected final static char[] TRUE_TOKEN = {'T'};
+  protected final static char[] FALSE_TOKEN = {'F'};

   ////////////////////////////////////////////////////////////////////////
   // TODO: look into creating my own queryParser that can more efficiently
   // handle single valued non-text fields (int,bool,etc) if needed.


   protected final static Analyzer boolAnalyzer = new SolrAnalyzer() {
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new Tokenizer(reader) {
-        boolean done=false;
-        public Token next() throws IOException {
-          if (done) return null;
-          done=true;
-          int ch = input.read();
-          if (ch==-1) return null;
-          return (ch=='t' || ch=='T' || ch=='1') ? TRUE_TOKEN : FALSE_TOKEN;
-        }
-      };
-    }
-  };
+    public TokenStreamInfo getStream(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new Tokenizer(reader) {
+        final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+        boolean done = false;
+
+        @Override
+        public void reset(Reader input) throws IOException {
+          done = false;
+          super.reset(input);
+        }
+
+        @Override
+        public boolean incrementToken() throws IOException {
+          clearAttributes();
+          if (done) return false;
+          done = true;
+          int ch = input.read();
+          if (ch==-1) return false;
+          termAtt.setTermBuffer(
+                  ((ch=='t' || ch=='T' || ch=='1') ? TRUE_TOKEN : FALSE_TOKEN)
+                  ,0,1);
+          return true;
+        }
+      };
+
+      return new TokenStreamInfo(tokenizer, tokenizer);
+    }
+  };


   public Analyzer getAnalyzer() {
     return boolAnalyzer;
@@ -23,6 +23,8 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermRangeQuery;
@@ -286,55 +288,38 @@ public abstract class FieldType extends FieldProperties {
     return toInternal(val);
   }

-  /*********
-  // default analyzer for non-text fields.
-  // Only reads 80 bytes, but that should be plenty for a single value.
-  public Analyzer getAnalyzer() {
-    if (analyzer != null) return analyzer;
-
-    // the default analyzer...
-    return new Analyzer() {
-      public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new Tokenizer(reader) {
-          final char[] cbuf = new char[80];
-          public Token next() throws IOException {
-            int n = input.read(cbuf,0,80);
-            if (n<=0) return null;
-            String s = toInternal(new String(cbuf,0,n));
-            return new Token(s,0,n);
-          };
-        };
-      }
-    };
-  }
-  **********/
-

   /**
    * Default analyzer for types that only produce 1 verbatim token...
    * A maximum size of chars to be read must be specified
    */
-  protected final class DefaultAnalyzer extends SolrAnalyzer {
+  protected class DefaultAnalyzer extends SolrAnalyzer {
     final int maxChars;

     DefaultAnalyzer(int maxChars) {
       this.maxChars=maxChars;
     }

-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new Tokenizer(reader) {
-        char[] cbuf = new char[maxChars];
-        public Token next() throws IOException {
+    public TokenStreamInfo getStream(String fieldName, Reader reader) {
+      Tokenizer ts = new Tokenizer(reader) {
+        final char[] cbuf = new char[maxChars];
+        final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+        final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+        @Override
+        public boolean incrementToken() throws IOException {
+          clearAttributes();
           int n = input.read(cbuf,0,maxChars);
-          if (n<=0) return null;
-          String s = toInternal(new String(cbuf,0,n)); // virtual func on parent
-          return new Token(s,0,n);
-        };
+          if (n<=0) return false;
+          String s = toInternal(new String(cbuf,0,n));
+          termAtt.setTermBuffer(s);
+          offsetAtt.setOffset(0,n);
+          return true;
+        }
       };
+
+      return new TokenStreamInfo(ts, ts);
     }
   }

   /**
    * Analyzer set by schema for text types to use when indexing fields
    * of this type, subclasses can set analyzer themselves or override
@@ -359,6 +359,11 @@ public final class IndexSchema {
       return getAnalyzer(fieldName).tokenStream(fieldName,reader);
     }

+    @Override
+    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+      return getAnalyzer(fieldName).reusableTokenStream(fieldName,reader);
+    }
+
     @Override
     public int getPositionIncrementGap(String fieldName) {
       return getAnalyzer(fieldName).getPositionIncrementGap(fieldName);
@@ -24,6 +24,7 @@ import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.*;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.TextField;
|
@ -65,7 +66,13 @@ public class FieldQParserPlugin extends QParserPlugin {
|
||||||
// Use the analyzer to get all the tokens, and then build a TermQuery,
|
// Use the analyzer to get all the tokens, and then build a TermQuery,
|
||||||
// PhraseQuery, or nothing based on the term count
|
// PhraseQuery, or nothing based on the term count
|
||||||
|
|
||||||
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
|
TokenStream source = null;
|
||||||
|
try {
|
||||||
|
source = analyzer.reusableTokenStream(field, new StringReader(queryText));
|
||||||
|
source.reset();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
|
||||||
|
}
|
||||||
ArrayList<Token> lst = new ArrayList<Token>();
|
ArrayList<Token> lst = new ArrayList<Token>();
|
||||||
Token t;
|
Token t;
|
||||||
int positionCount = 0;
|
int positionCount = 0;
|
||||||
|
|
|
@@ -22,8 +22,10 @@ import org.apache.lucene.document.Field;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.util.StrUtils;

 import java.io.IOException;
+import java.util.Arrays;

 /** Bypass the normal Solr pipeline and just text indexing performance
  * starting at the update handler.  The same document is indexed repeatedly.
@@ -39,6 +41,12 @@ public class TestIndexingPerformance extends AbstractSolrTestCase {
     int iter=1000;
     String iterS = System.getProperty("iter");
     if (iterS != null) iter=Integer.parseInt(iterS);
+    boolean includeDoc = Boolean.parseBoolean(System.getProperty("includeDoc","true")); // include the time to create the document
+    String doc = System.getProperty("doc");
+    if (doc != null) {
+      StrUtils.splitSmart(doc,",",true);
+    }
+
     SolrQueryRequest req = lrf.makeRequest();
     IndexSchema schema = req.getSchema();
@@ -53,23 +61,43 @@
       ,"text","just how fast is this text indexing?"
     };

-    Document ldoc = new Document();
-    for (int i=0; i<fields.length; i+=2) {
-      String field = fields[i];
-      String val = fields[i+1];
-      Field f = schema.getField(field).createField(val, 1.0f);
-      ldoc.add(f);
-    }
+    /***
+    String[] fields = {
+      "a_i","1"
+      ,"b_i","2"
+      ,"c_i","3"
+      ,"d_i","4"
+      ,"e_i","5"
+      ,"f_i","6"
+      ,"g_i","7"
+      ,"h_i","8"
+      ,"i_i","9"
+      ,"j_i","0"
+      ,"k_i","0"
+    };
+    ***/
+
+    long start = System.currentTimeMillis();

     AddUpdateCommand add = new AddUpdateCommand();
     add.allowDups = true;
-    add.doc = ldoc;

-    long start = System.currentTimeMillis();
     for (int i=0; i<iter; i++) {
+      if (includeDoc || add.doc==null) {
+        add.doc = new Document();
+        for (int j=0; j<fields.length; j+=2) {
+          String field = fields[j];
+          String val = fields[j+1];
+          Field f = schema.getField(field).createField(val, 1.0f);
+          add.doc.add(f);
+        }
+      }
       updateHandler.addDoc(add);
     }
     long end = System.currentTimeMillis();
+    System.out.println("includeDoc="+includeDoc+" doc="+ Arrays.toString(fields));
     System.out.println("iter="+iter +" time=" + (end-start) + " throughput=" + ((long)iter*1000)/(end-start));

     //discard all the changes