From bb76127225655c084280d606f2fc03bee358f792 Mon Sep 17 00:00:00 2001
From: Yonik Seeley
Date: Sun, 16 Aug 2009 17:28:58 +0000
Subject: [PATCH] SOLR-1353: Implement and use reusable token streams for
 analysis

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@804726 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                        |  3 +
 .../analysis/CapitalizationFilterFactory.java      | 69 +++++++++----------
 .../solr/analysis/DoubleMetaphoneFilter.java       |  3 +-
 .../analysis/EnglishPorterFilterFactory.java       | 48 ++-----------
 .../apache/solr/analysis/KeepWordFilter.java       | 17 ++---
 .../solr/analysis/PatternReplaceFilter.java        | 31 +++++----
 .../analysis/PatternTokenizerFactory.java          | 29 ++++++++
 .../apache/solr/analysis/PhoneticFilter.java       |  3 +-
 .../analysis/SnowballPorterFilterFactory.java      | 54 ++++++---------
 .../apache/solr/analysis/SolrAnalyzer.java         | 46 ++++++++++++-
 .../apache/solr/analysis/SynonymFilter.java        |  5 ++
 .../apache/solr/analysis/TokenizerChain.java       | 21 ++----
 .../solr/analysis/TrieTokenizerFactory.java        | 69 +++++++++++++++----
 .../org/apache/solr/analysis/TrimFilter.java       | 39 +++++------
 .../solr/analysis/WordDelimiterFilter.java         |  6 ++
 .../handler/AnalysisRequestHandlerBase.java        | 10 ++-
 .../component/QueryElevationComponent.java         |  4 +-
 .../component/SpellCheckComponent.java             |  3 +-
 .../highlight/DefaultSolrHighlighter.java          |  4 +-
 .../org/apache/solr/schema/BoolField.java          | 48 ++++++++-----
 .../org/apache/solr/schema/FieldType.java          | 53 +++++---------
 .../org/apache/solr/schema/IndexSchema.java        |  5 ++
 .../solr/search/FieldQParserPlugin.java            |  9 ++-
 .../solr/update/TestIndexingPerformance.java       | 46 ++++++++++---
 24 files changed, 377 insertions(+), 248 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index c5db94812d2..eefd8fe2407 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -320,6 +320,9 @@ Optimizations
 15. SOLR-1150: Load Documents for Highlighting one at a time rather than
     all at once to avoid OOM with many large Documents. (Siddharth Gargate via Mark Miller)
 
+16. SOLR-1353: Implement and use reusable token streams for analysis. (yonik)
+
+
 Bug Fixes
 ----------------------
  1. SOLR-774: Fixed logging level display (Sean Timm via Otis Gospodnetic)
diff --git a/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java b/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java
index 4589e4e8ade..693ddec734b 100644
--- a/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java
@@ -17,10 +17,8 @@
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -190,52 +188,53 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
  * This is package protected since it is not useful without the Factory
  */
 class CapitalizationFilter extends TokenFilter {
-  protected final CapitalizationFilterFactory factory;
+  private final CapitalizationFilterFactory factory;
+  private final TermAttribute termAtt;
 
   public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
     super(in);
     this.factory = factory;
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
   @Override
-  public Token next(Token token) throws IOException {
-    Token t = input.next(token);
-    if (t != null) {
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;
 
-      char[] termBuffer = t.termBuffer();
-      int termBufferLength = t.termLength();
-      char[] backup = null;
-      if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
-        //make a backup in case we exceed the word count
-        System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
-      }
-      if (termBufferLength < factory.maxTokenLength) {
-        int wordCount = 0;
+    char[] termBuffer = termAtt.termBuffer();
+    int termBufferLength = termAtt.termLength();
+    char[] backup = null;
+    if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
+      //make a backup in case we exceed the word count
+      System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
+    }
+    if (termBufferLength < factory.maxTokenLength) {
+      int wordCount = 0;
 
-        int lastWordStart = 0;
-        for (int i = 0; i < termBufferLength; i++) {
-          char c = termBuffer[i];
-          if (c <= ' ' || c == '.') {
-            int len = i - lastWordStart;
-            if (len > 0) {
-              factory.processWord(termBuffer, lastWordStart, len, wordCount++);
-              lastWordStart = i + 1;
-              i++;
-            }
+      int lastWordStart = 0;
+      for (int i = 0; i < termBufferLength; i++) {
+        char c = termBuffer[i];
+        if (c <= ' ' || c == '.') {
+          int len = i - lastWordStart;
+          if (len > 0) {
+            factory.processWord(termBuffer, lastWordStart, len, wordCount++);
+            lastWordStart = i + 1;
+            i++;
           }
         }
+      }
 
-      // process the last word
-      if (lastWordStart < termBufferLength) {
-        factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
-      }
+      // process the last word
+      if (lastWordStart < termBufferLength) {
+        factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
+      }
 
-      if (wordCount > factory.maxWordCount) {
-        t.setTermBuffer(backup, 0, termBufferLength);
-      }
+      if (wordCount > factory.maxWordCount) {
+        termAtt.setTermBuffer(backup, 0, termBufferLength);
       }
     }
-    return t;
+
+    return true;
   }
 }
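
The hunk above is the template for the rest of this patch: the deprecated next(Token) pull model becomes incrementToken(), with term text read and written through a shared TermAttribute. Below is a minimal sketch of that ported shape under the Lucene 2.9 API; the class name and the lowercasing body are invented for illustration and are not part of the commit:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Illustrative only: the before/after shape used throughout this patch.
    class LowerCaseSketchFilter extends TokenFilter {
      private final TermAttribute termAtt;

      LowerCaseSketchFilter(TokenStream in) {
        super(in);
        // register (or fetch) the shared term attribute once, in the constructor
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      @Override
      public boolean incrementToken() throws IOException {
        // old API: Token t = input.next(token); if (t == null) return null;
        if (!input.incrementToken()) return false;
        char[] buf = termAtt.termBuffer();          // shared, reusable buffer
        int len = termAtt.termLength();
        for (int i = 0; i < len; i++) {
          buf[i] = Character.toLowerCase(buf[i]);   // edit the token in place
        }
        return true;
      }
    }

Because termBuffer() is the stream's single shared buffer, filters edit it in place rather than allocating a new Token per call, which is where the reuse savings come from.
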
diff --git a/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java b/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
index 3d8a1c686ab..450dc1f6eb9 100644
--- a/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
+++ b/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
@@ -50,7 +50,8 @@ public class DoubleMetaphoneFilter extends TokenFilter {
     for(;;) {
 
       if (!remainingTokens.isEmpty()) {
-        clearAttributes(); restoreState(remainingTokens.removeFirst());
+        // clearAttributes();  // not currently necessary
+        restoreState(remainingTokens.removeFirst());
         return true;
       }
diff --git a/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java b/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
index 09f8233da8d..4dc5a9c8da5 100644
--- a/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
+import org.tartarus.snowball.SnowballProgram;
 
 import java.io.IOException;
 import java.io.File;
@@ -75,50 +76,9 @@ public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implement
  * English Porter2 filter that doesn't use reflection to
  * adapt lucene to the snowball stemmer code.
  */
-class EnglishPorterFilter extends TokenFilter {
-  private final CharArraySet protWords;
-  private org.tartarus.snowball.ext.EnglishStemmer stemmer;
-
+@Deprecated
+class EnglishPorterFilter extends SnowballPorterFilter {
   public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
-    super(source);
-    this.protWords = protWords;
-    stemmer = new org.tartarus.snowball.ext.EnglishStemmer();
-  }
-
-
-  /**
-   * the original code from lucene sandbox
-   * public final Token next() throws IOException {
-   *   Token token = input.next();
-   *   if (token == null)
-   *     return null;
-   *   stemmer.setCurrent(token.termText());
-   *   try {
-   *     stemMethod.invoke(stemmer, EMPTY_ARGS);
-   *   } catch (Exception e) {
-   *     throw new RuntimeException(e.toString());
-   *   }
-   *   return new Token(stemmer.getCurrent(),
-   *                    token.startOffset(), token.endOffset(), token.type());
-   * }
-   */
-
-  @Override
-  public Token next(Token token) throws IOException {
-    Token result = input.next(token);
-    if (result != null) {
-      char[] termBuffer = result.termBuffer();
-      int len = result.termLength();
-      // if protected, don't stem.  use this to avoid stemming collisions.
-      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
-        return result;
-      }
-      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
-      stemmer.stem();
-      String newstr = stemmer.getCurrent();
-      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
-    }
-    return result;
+    super(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords);
   }
 }
-
diff --git a/src/java/org/apache/solr/analysis/KeepWordFilter.java b/src/java/org/apache/solr/analysis/KeepWordFilter.java
index cd8ad472d29..74e9bb744bd 100644
--- a/src/java/org/apache/solr/analysis/KeepWordFilter.java
+++ b/src/java/org/apache/solr/analysis/KeepWordFilter.java
@@ -21,6 +21,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.tartarus.snowball.SnowballProgram;
 
 import java.io.IOException;
 import java.util.Set;
@@ -33,21 +35,20 @@ import java.util.Set;
  * @since solr 1.3
  */
 public final class KeepWordFilter extends TokenFilter {
-  final CharArraySet words;
+  private final CharArraySet words;
+  private final TermAttribute termAtt;
 
   public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
     super(in);
     this.words = new CharArraySet(words, ignoreCase);
+    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
   }
 
   @Override
-  public final Token next(Token in) throws IOException {
-    for (Token token=input.next(in); token!=null; token=input.next(token)) {
-      if( words.contains( token.termBuffer(), 0, token.termLength() ) ) {
-        return token;
-      }
+  public boolean incrementToken() throws IOException {
+    while (input.incrementToken()) {
+      if (words.contains(termAtt.termBuffer(), 0, termAtt.termLength())) return true;
     }
-    return null;
+    return false;
   }
 }
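
For reference, a hypothetical driver for the KeepWordFilter above, showing how a consumer iterates the attribute-based API; the word set and input text are made up:

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.solr.analysis.KeepWordFilter;

    public class KeepWordDemo {
      public static void main(String[] args) throws IOException {
        Set<String> keep = new HashSet<String>(Arrays.asList("solr", "lucene"));
        TokenStream ts = new KeepWordFilter(
            new WhitespaceTokenizer(new StringReader("Solr wraps Lucene analysis")),
            keep, true);                      // ignoreCase=true, so "Solr" matches
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term());    // prints: Solr, Lucene
        }
      }
    }
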
"" : replacement; this.all=all; + this.termAtt = (TermAttribute)addAttribute(TermAttribute.class); } - - public final Token next(Token in) throws IOException { - Token t = input.next(in); - if (t == null) - return null; - CharSequence text = CharBuffer.wrap(t.termBuffer(), 0, t.termLength()); + + @Override + public boolean incrementToken() throws IOException { + if (!input.incrementToken()) return false; + + CharSequence text = CharBuffer.wrap(termAtt.termBuffer(), 0, termAtt.termLength()); Matcher m = p.matcher(text); + if (all) { - t.setTermText(m.replaceAll(replacement)); + termAtt.setTermBuffer(m.replaceAll(replacement)); } else { - t.setTermText(m.replaceFirst(replacement)); + termAtt.setTermBuffer(m.replaceFirst(replacement)); } - return t; + return true; } } diff --git a/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java b/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java index 6b575fd0b26..f9faed31a9e 100644 --- a/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java +++ b/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java @@ -20,6 +20,10 @@ package org.apache.solr.analysis; import org.apache.commons.io.IOUtils; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.solr.common.SolrException; import org.apache.solr.core.SolrConfig; @@ -111,6 +115,31 @@ public class PatternTokenizerFactory extends BaseTokenizerFactory final Iterator iter = tokens.iterator(); return new TokenStream() { + @Override + public boolean incrementToken() throws IOException { + return super.incrementToken(); + } + + @Override + public void end() throws IOException { + super.end(); + } + + @Override + public Token next(Token reusableToken) throws IOException { + return super.next(reusableToken); + } + + @Override + public void reset() throws IOException { + super.reset(); + } + + @Override + public void close() throws IOException { + super.close(); + } + @Override public Token next() throws IOException { if( iter.hasNext() ) { diff --git a/src/java/org/apache/solr/analysis/PhoneticFilter.java b/src/java/org/apache/solr/analysis/PhoneticFilter.java index 5892da97f8c..c097cfa7a21 100644 --- a/src/java/org/apache/solr/analysis/PhoneticFilter.java +++ b/src/java/org/apache/solr/analysis/PhoneticFilter.java @@ -54,7 +54,8 @@ public class PhoneticFilter extends TokenFilter @Override public boolean incrementToken() throws IOException { if( save != null ) { - clearAttributes(); restoreState(save); + // clearAttributes(); // not currently necessary + restoreState(save); save = null; return true; } diff --git a/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java b/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java index f7c8ec3509e..86c7ef8e417 100644 --- a/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java +++ b/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.solr.common.ResourceLoader; import 
diff --git a/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java b/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
index f7c8ec3509e..86c7ef8e417 100644
--- a/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.util.StrUtils;
@@ -97,50 +98,35 @@ public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implemen
   }
 }
 
+
 class SnowballPorterFilter extends TokenFilter {
   private final CharArraySet protWords;
-  private SnowballProgram stemmer;
+  private final SnowballProgram stemmer;
+  private final TermAttribute termAtt;
 
   public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
     super(source);
     this.protWords = protWords;
     this.stemmer = stemmer;
+    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
   }
-
-  /**
-   * the original code from lucene sandbox
-   * public final Token next() throws IOException {
-   *   Token token = input.next();
-   *   if (token == null)
-   *     return null;
-   *   stemmer.setCurrent(token.termText());
-   *   try {
-   *     stemMethod.invoke(stemmer, EMPTY_ARGS);
-   *   } catch (Exception e) {
-   *     throw new RuntimeException(e.toString());
-   *   }
-   *   return new Token(stemmer.getCurrent(),
-   *                    token.startOffset(), token.endOffset(), token.type());
-   * }
-   */
 
   @Override
-  public Token next(Token token) throws IOException {
-    Token result = input.next(token);
-    if (result != null) {
-      char[] termBuffer = result.termBuffer();
-      int len = result.termLength();
-      // if protected, don't stem.  use this to avoid stemming collisions.
-      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
-        return result;
-      }
-      stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
-      stemmer.stem();
-      String newstr = stemmer.getCurrent();
-      result.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) return false;
+
+    char[] termBuffer = termAtt.termBuffer();
+    int len = termAtt.termLength();
+    // if protected, don't stem.  use this to avoid stemming collisions.
+    if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+      return true;
     }
-    return result;
+
+    stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array
+    stemmer.stem();
+    String newstr = stemmer.getCurrent();
+    termAtt.setTermBuffer(newstr.toCharArray(), 0, newstr.length());
+
+    return true;
   }
 }
-
diff --git a/src/java/org/apache/solr/analysis/SolrAnalyzer.java b/src/java/org/apache/solr/analysis/SolrAnalyzer.java
index 3348fdddeb2..6244e0c82f6 100644
--- a/src/java/org/apache/solr/analysis/SolrAnalyzer.java
+++ b/src/java/org/apache/solr/analysis/SolrAnalyzer.java
@@ -17,7 +17,10 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.*;
+
+import java.io.Reader;
+import java.io.IOException;
 
 /**
  * @version $Id$
@@ -32,4 +35,45 @@ public abstract class SolrAnalyzer extends Analyzer {
   public int getPositionIncrementGap(String fieldName) {
     return posIncGap;
   }
+
+  /** wrap the reader in a CharStream, if appropriate */
+  public Reader charStream(Reader reader){
+    return reader;
+  }
+
+  @Override
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    return getStream(fieldName, reader).getTokenStream();
+  }
+
+  public static class TokenStreamInfo {
+    private final Tokenizer tokenizer;
+    private final TokenStream tokenStream;
+    public TokenStreamInfo(Tokenizer tokenizer, TokenStream tokenStream) {
+      this.tokenizer = tokenizer;
+      this.tokenStream = tokenStream;
+    }
+    public Tokenizer getTokenizer() { return tokenizer; }
+    public TokenStream getTokenStream() { return tokenStream; }
+  }
+
+
+  public abstract TokenStreamInfo getStream(String fieldName, Reader reader);
+
+  @Override
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    // if (true) return tokenStream(fieldName, reader);
+    TokenStreamInfo tsi = (TokenStreamInfo)getPreviousTokenStream();
+    if (tsi != null) {
+      tsi.getTokenizer().reset(charStream(reader));
+      // the consumer will currently call reset() on the TokenStream to hit all the filters.
+      // this isn't necessarily guaranteed by the APIs... but is currently done
+      // by lucene indexing in DocInverterPerField, and in the QueryParser
+      return tsi.getTokenStream();
+    } else {
+      tsi = getStream(fieldName, reader);
+      setPreviousTokenStream(tsi);
+      return tsi.getTokenStream();
+    }
+  }
 }
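
The reusableTokenStream() contract introduced above shifts a duty onto consumers: the returned chain must be reset() before iteration, exactly as the comment in the hunk warns. A hypothetical consumer, with invented names, might look like this:

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ReuseDemo {
      // Prints each term produced for 'text'; safe to call repeatedly because
      // the analyzer hands back the same (reset) chain on every call per thread.
      static void dumpTerms(Analyzer analyzer, String field, String text) throws IOException {
        TokenStream ts = analyzer.reusableTokenStream(field, new StringReader(text));
        ts.reset();  // lets every filter in the chain clear leftover state
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term());
        }
      }
    }
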
diff --git a/src/java/org/apache/solr/analysis/SynonymFilter.java b/src/java/org/apache/solr/analysis/SynonymFilter.java
index 1191aa767d7..f02bd8520b4 100644
--- a/src/java/org/apache/solr/analysis/SynonymFilter.java
+++ b/src/java/org/apache/solr/analysis/SynonymFilter.java
@@ -205,4 +205,9 @@ public class SynonymFilter extends TokenFilter {
     return result;
   }
 
+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    replacement = null;
+  }
 }
diff --git a/src/java/org/apache/solr/analysis/TokenizerChain.java b/src/java/org/apache/solr/analysis/TokenizerChain.java
index 4ea6c371735..eff4e83e4fe 100644
--- a/src/java/org/apache/solr/analysis/TokenizerChain.java
+++ b/src/java/org/apache/solr/analysis/TokenizerChain.java
@@ -20,8 +20,10 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.Tokenizer;
 
 import java.io.Reader;
+import java.io.IOException;
 
 /**
  * @version $Id$
@@ -50,23 +52,14 @@ public class TokenizerChain extends SolrAnalyzer {
   public TokenizerFactory getTokenizerFactory() { return tokenizer; }
   public TokenFilterFactory[] getTokenFilterFactories() { return filters; }
 
-  public Reader charStream(Reader reader){
-    if( charFilters != null && charFilters.length > 0 ){
-      CharStream cs = CharReader.get( reader );
-      for (int i=0; i<charFilters.length; i++) {
diff --git a/src/java/org/apache/solr/analysis/TrimFilter.java b/src/java/org/apache/solr/analysis/TrimFilter.java
       if (start > 0 || end < len) {
         if (start < end) {
-          t.setTermBuffer(t.termBuffer(), start, (end - start));
+          termAtt.setTermBuffer(termBuffer, start, (end - start));
         } else {
-          t.setTermLength(0);
+          termAtt.setTermLength(0);
         }
         if (updateOffsets) {
-          t.setStartOffset(t.startOffset() + start);
-          if (start < end) {
-            t.setEndOffset(t.endOffset() - endOff);
-          } //else if end is less than, start, then the term length is 0, so, no need to bother w/ the end offset
+          int newStart = offsetAtt.startOffset()+start;
+          int newEnd = offsetAtt.endOffset() - (start<end ? endOff : 0);
+          offsetAtt.setOffset(newStart, newEnd);
         }
       }
diff --git a/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java b/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
     NamedList<NamedList> namedList = new SimpleOrderedMap<NamedList>();
     namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
     return namedList;
diff --git a/src/java/org/apache/solr/handler/component/QueryElevationComponent.java b/src/java/org/apache/solr/handler/component/QueryElevationComponent.java
index c3203d4cd14..e8a4aa495a0 100644
--- a/src/java/org/apache/solr/handler/component/QueryElevationComponent.java
+++ b/src/java/org/apache/solr/handler/component/QueryElevationComponent.java
@@ -294,7 +294,9 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore
       return query;
     }
     StringBuilder norm = new StringBuilder();
-    TokenStream tokens = analyzer.tokenStream( null, new StringReader( query ) );
+    TokenStream tokens = analyzer.reusableTokenStream( "", new StringReader( query ) );
+    tokens.reset();
+
     Token token = tokens.next();
     while( token != null ) {
       norm.append( new String(token.termBuffer(), 0, token.termLength()) );
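
The SynonymFilter.reset() added above is the pattern any state-carrying filter needs once streams are reused: without it, tokens buffered while analyzing one document would replay into the next. A generic sketch, not Solr code, of what such a filter must do:

    import java.io.IOException;
    import java.util.LinkedList;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.AttributeSource;

    abstract class BufferingSketchFilter extends TokenFilter {
      // queued attribute snapshots waiting to be emitted
      protected final LinkedList<AttributeSource.State> pending =
          new LinkedList<AttributeSource.State>();

      BufferingSketchFilter(TokenStream in) {
        super(in);
      }

      @Override
      public void reset() throws IOException {
        input.reset();    // propagate down the chain, as SynonymFilter does
        pending.clear();  // drop tokens buffered from the previous input
      }
    }
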
@@ -160,7 +160,8 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
   private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
     Collection<Token> result = new ArrayList<Token>();
     Token token = null;
-    TokenStream ts = analyzer.tokenStream("", new StringReader(q));
+    TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q));
+    ts.reset();
     while ((token = ts.next()) != null){
       result.add(token);
     }
diff --git a/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java b/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
index e353b1c5d1e..de60c2c97f1 100644
--- a/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
+++ b/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
@@ -286,7 +286,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter
       }
       catch (IllegalArgumentException e) {
         // fall back to anaylzer
-        tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[j])), 10);
+        TokenStream ts = schema.getAnalyzer().reusableTokenStream(fieldName, new StringReader(docTexts[j]));
+        ts.reset();
+        tstream = new TokenOrderingFilter(ts, 10);
       }
 
       Highlighter highlighter;
diff --git a/src/java/org/apache/solr/schema/BoolField.java b/src/java/org/apache/solr/schema/BoolField.java
index 605bcc94f15..834d7a0220c 100644
--- a/src/java/org/apache/solr/schema/BoolField.java
+++ b/src/java/org/apache/solr/schema/BoolField.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Fieldable;
 import org.apache.solr.request.XMLWriter;
 import org.apache.solr.request.TextResponseWriter;
@@ -48,28 +49,43 @@ public class BoolField extends FieldType {
   }
 
   // avoid instantiating every time...
-  protected final static Token TRUE_TOKEN = new Token("T",0,1);
-  protected final static Token FALSE_TOKEN = new Token("F",0,1);
+  protected final static char[] TRUE_TOKEN = {'T'};
+  protected final static char[] FALSE_TOKEN = {'F'};
 
   ////////////////////////////////////////////////////////////////////////
   // TODO: look into creating my own queryParser that can more efficiently
   // handle single valued non-text fields (int,bool,etc) if needed.
 
   protected final static Analyzer boolAnalyzer = new SolrAnalyzer() {
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new Tokenizer(reader) {
-        boolean done=false;
-        public Token next() throws IOException {
-          if (done) return null;
-          done=true;
-          int ch = input.read();
-          if (ch==-1) return null;
-          return (ch=='t' || ch=='T' || ch=='1') ? TRUE_TOKEN : FALSE_TOKEN;
-        }
-      };
-    }
-  };
+      public TokenStreamInfo getStream(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new Tokenizer(reader) {
+          final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+          boolean done = false;
+
+          @Override
+          public void reset(Reader input) throws IOException {
+            done = false;
+            super.reset(input);
+          }
+
+          @Override
+          public boolean incrementToken() throws IOException {
+            clearAttributes();
+            if (done) return false;
+            done = true;
+            int ch = input.read();
+            if (ch==-1) return false;
+            termAtt.setTermBuffer(
+                    ((ch=='t' || ch=='T' || ch=='1') ? TRUE_TOKEN : FALSE_TOKEN)
+                    ,0,1);
+            return true;
+          }
+        };
+
+        return new TokenStreamInfo(tokenizer, tokenizer);
+      }
+    };
 
   public Analyzer getAnalyzer() {
     return boolAnalyzer;
diff --git a/src/java/org/apache/solr/schema/FieldType.java b/src/java/org/apache/solr/schema/FieldType.java
index 9c9993ae388..307f5487752 100644
--- a/src/java/org/apache/solr/schema/FieldType.java
+++ b/src/java/org/apache/solr/schema/FieldType.java
@@ -23,6 +23,8 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermRangeQuery;
@@ -286,55 +288,38 @@ public abstract class FieldType extends FieldProperties {
     return toInternal(val);
   }
 
-  /*********
-  // default analyzer for non-text fields.
-  // Only reads 80 bytes, but that should be plenty for a single value.
-  public Analyzer getAnalyzer() {
-    if (analyzer != null) return analyzer;
-
-    // the default analyzer...
-    return new Analyzer() {
-      public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new Tokenizer(reader) {
-          final char[] cbuf = new char[80];
-          public Token next() throws IOException {
-            int n = input.read(cbuf,0,80);
-            if (n<=0) return null;
-            String s = toInternal(new String(cbuf,0,n));
-            return new Token(s,0,n);
-          };
-        };
-      }
-    };
-  }
-  **********/
-
-
   /**
    * Default analyzer for types that only produce 1 verbatim token...
    * A maximum size of chars to be read must be specified
    */
-  protected final class DefaultAnalyzer extends SolrAnalyzer {
+  protected class DefaultAnalyzer extends SolrAnalyzer {
     final int maxChars;
 
     DefaultAnalyzer(int maxChars) {
       this.maxChars=maxChars;
     }
 
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new Tokenizer(reader) {
-        char[] cbuf = new char[maxChars];
-        public Token next() throws IOException {
+    public TokenStreamInfo getStream(String fieldName, Reader reader) {
+      Tokenizer ts = new Tokenizer(reader) {
+        final char[] cbuf = new char[maxChars];
+        final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+        final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+        @Override
+        public boolean incrementToken() throws IOException {
+          clearAttributes();
           int n = input.read(cbuf,0,maxChars);
-          if (n<=0) return null;
-          String s = toInternal(new String(cbuf,0,n)); // virtual func on parent
-          return new Token(s,0,n);
-        };
+          if (n<=0) return false;
+          String s = toInternal(new String(cbuf,0,n));
+          termAtt.setTermBuffer(s);
+          offsetAtt.setOffset(0,n);
+          return true;
+        }
       };
+
+      return new TokenStreamInfo(ts, ts);
     }
   }
 
-
   /**
    * Analyzer set by schema for text types to use when indexing fields
    * of this type, subclasses can set analyzer themselves or override
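
BoolField and FieldType.DefaultAnalyzer above share one template: a tokenizer that emits a single token per value and rearms itself in reset(Reader). A standalone sketch under the same API follows; the names are invented, and the one line that matters is clearing the done flag in reset, since forgetting it is the classic reuse bug (the second document would produce no tokens):

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    class SingleTokenSketchTokenizer extends Tokenizer {
      private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      private boolean done = false;

      SingleTokenSketchTokenizer(Reader input) {
        super(input);
      }

      @Override
      public void reset(Reader input) throws IOException {
        super.reset(input);  // swap in the next value's Reader
        done = false;        // rearm for the next value
      }

      @Override
      public boolean incrementToken() throws IOException {
        clearAttributes();
        if (done) return false;
        done = true;
        char[] buf = new char[80];                 // plenty for one field value
        int n = input.read(buf, 0, buf.length);
        if (n <= 0) return false;
        termAtt.setTermBuffer(buf, 0, n);          // the whole input, verbatim
        return true;
      }
    }
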
diff --git a/src/java/org/apache/solr/schema/IndexSchema.java b/src/java/org/apache/solr/schema/IndexSchema.java
index 7b076dde173..c04eaa0de9d 100644
--- a/src/java/org/apache/solr/schema/IndexSchema.java
+++ b/src/java/org/apache/solr/schema/IndexSchema.java
@@ -359,6 +359,11 @@ public final class IndexSchema {
       return getAnalyzer(fieldName).tokenStream(fieldName,reader);
     }
 
+    @Override
+    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+      return getAnalyzer(fieldName).reusableTokenStream(fieldName,reader);
+    }
+
     @Override
     public int getPositionIncrementGap(String fieldName) {
       return getAnalyzer(fieldName).getPositionIncrementGap(fieldName);
diff --git a/src/java/org/apache/solr/search/FieldQParserPlugin.java b/src/java/org/apache/solr/search/FieldQParserPlugin.java
index 1dca87a2fbd..ecbe658cf4e 100644
--- a/src/java/org/apache/solr/search/FieldQParserPlugin.java
+++ b/src/java/org/apache/solr/search/FieldQParserPlugin.java
@@ -24,6 +24,7 @@ import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.*;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.TextField;
@@ -65,7 +66,13 @@ public class FieldQParserPlugin extends QParserPlugin {
 
       // Use the analyzer to get all the tokens, and then build a TermQuery,
       // PhraseQuery, or nothing based on the term count
-      TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
+      TokenStream source = null;
+      try {
+        source = analyzer.reusableTokenStream(field, new StringReader(queryText));
+        source.reset();
+      } catch (IOException e) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
+      }
       ArrayList<Token> lst = new ArrayList<Token>();
       Token t;
       int positionCount = 0;
diff --git a/src/test/org/apache/solr/update/TestIndexingPerformance.java b/src/test/org/apache/solr/update/TestIndexingPerformance.java
index 48c7f102881..32f73ac538d 100755
--- a/src/test/org/apache/solr/update/TestIndexingPerformance.java
+++ b/src/test/org/apache/solr/update/TestIndexingPerformance.java
@@ -22,8 +22,10 @@ import org.apache.lucene.document.Field;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.common.util.StrUtils;
 
 import java.io.IOException;
+import java.util.Arrays;
 
 /** Bypass the normal Solr pipeline and just text indexing performance
  * starting at the update handler.  The same document is indexed repeatedly.
@@ -39,6 +41,12 @@ public class TestIndexingPerformance extends AbstractSolrTestCase {
     int iter=1000;
     String iterS = System.getProperty("iter");
     if (iterS != null) iter=Integer.parseInt(iterS);
+    boolean includeDoc = Boolean.parseBoolean(System.getProperty("includeDoc","true")); // include the time to create the document
+    String doc = System.getProperty("doc");
+    if (doc != null) {
+      StrUtils.splitSmart(doc,",",true);
+    }
+
     SolrQueryRequest req = lrf.makeRequest();
     IndexSchema schema = req.getSchema();
 
@@ -53,23 +61,43 @@ public class TestIndexingPerformance extends AbstractSolrTestCase {
       ,"text","just how fast is this text indexing?"
     };
 
-    Document ldoc = new Document();
-    for (int i=0; i<fields.length; i+=2) {
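
To close, a hypothetical check of the reuse semantics this benchmark exercises. It assumes, as in Lucene 2.9, that Analyzer keeps its previous stream in a ThreadLocal, so reuse happens per analyzer instance and per thread, and that the "text" field's analyzer is a SolrAnalyzer supporting reuse; the class and method are invented:

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.solr.schema.IndexSchema;

    public class ReuseIdentityCheck {
      static void check(IndexSchema schema) throws IOException {
        Analyzer a = schema.getAnalyzer();
        TokenStream t1 = a.reusableTokenStream("text", new StringReader("one"));
        TokenStream t2 = a.reusableTokenStream("text", new StringReader("two"));
        // Same object: the second call reset the first chain and returned it.
        // That per-call allocation is what this patch eliminates.
        assert t1 == t2;
      }
    }
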