diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f39f14a13a0..0beed4284a6 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -137,6 +137,10 @@ Optimizations * LUCENE-5634: IndexWriter reuses TokenStream instances for String and Numeric fields by default. (Uwe Schindler, Shay Banon, Mike McCandless, Robert Muir) +* LUCENE-5638: TokenStream uses a more performant AttributeFactory by default, + that packs the core attributes into one impl, for faster clearAttributes(), + saveState(), and restoreState(). (Uwe Schindler, Robert Muir) + Bug fixes * LUCENE-5600: HttpClientBase did not properly consume a connection if a server diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java index b762178cd6e..4c3fc30368a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java @@ -19,6 +19,7 @@ package org.apache.lucene.analysis.path; import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -68,7 +69,7 @@ public class PathHierarchyTokenizer extends Tokenizer { } public PathHierarchyTokenizer(int bufferSize, char delimiter, char replacement, int skip) { - this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, bufferSize, delimiter, replacement, skip); + this(Token.TOKEN_ATTRIBUTE_FACTORY, bufferSize, delimiter, replacement, skip); } public PathHierarchyTokenizer diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java index 71db68d4633..0f58f7f3dc8 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java @@ -21,6 +21,7 @@ import java.io.Reader; import java.util.ArrayList; import java.util.List; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -81,7 +82,7 @@ public class ReversePathHierarchyTokenizer extends Tokenizer { } public ReversePathHierarchyTokenizer( int bufferSize, char delimiter, char replacement, int skip) { - this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, bufferSize, delimiter, replacement, skip); + this(Token.TOKEN_ATTRIBUTE_FACTORY, bufferSize, delimiter, replacement, skip); } public ReversePathHierarchyTokenizer (AttributeFactory factory, int bufferSize, char delimiter, char replacement, int skip) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java index 761535e6e4b..6dcf0720c88 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java @@ -21,6 +21,8 @@ import java.io.IOException; import java.io.Reader; import java.util.regex.Matcher; import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -65,7 +67,7 @@ public final class PatternTokenizer extends Tokenizer { /** creates a new PatternTokenizer returning tokens from group (-1 for split functionality) */ public PatternTokenizer(Pattern pattern, int group) { - this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, pattern, group); + this(Token.TOKEN_ATTRIBUTE_FACTORY, pattern, group); } /** creates a new PatternTokenizer returning tokens from group (-1 for split functionality) */ diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java index e2b40842657..b916fbdb44b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java @@ -20,6 +20,7 @@ package org.apache.lucene.analysis.th; import java.text.BreakIterator; import java.util.Locale; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.util.CharArrayIterator; @@ -59,7 +60,7 @@ public class ThaiTokenizer extends SegmentingTokenizerBase { /** Creates a new ThaiTokenizer */ public ThaiTokenizer() { - this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); + this(Token.TOKEN_ATTRIBUTE_FACTORY); } /** Creates a new ThaiTokenizer, supplying the AttributeFactory */ diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java index 7bc4bb41759..00c3e9fd2b6 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java @@ -22,6 +22,7 @@ import java.io.Reader; import java.text.BreakIterator; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -62,7 +63,7 @@ public abstract class SegmentingTokenizerBase extends Tokenizer { * be provided to this constructor. */ public SegmentingTokenizerBase(BreakIterator iterator) { - this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, iterator); + this(Token.TOKEN_ATTRIBUTE_FACTORY, iterator); } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenizerFactory.java index 3436930bfc4..d86092af069 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenizerFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenizerFactory.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.util; * limitations under the License. */ +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.util.AttributeSource.AttributeFactory; @@ -72,7 +73,7 @@ public abstract class TokenizerFactory extends AbstractAnalysisFactory { /** Creates a TokenStream of the specified input using the default attribute factory. */ public final Tokenizer create() { - return create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); + return create(Token.TOKEN_ATTRIBUTE_FACTORY); } /** Creates a TokenStream of the specified input using the given AttributeFactory */ diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java index a4c79c2a5a6..1db52ad6f1d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationAttributeFactory.java @@ -19,6 +19,7 @@ package org.apache.lucene.collation; import java.text.Collator; +import org.apache.lucene.analysis.Token; import org.apache.lucene.collation.tokenattributes.CollatedTermAttributeImpl; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; @@ -74,12 +75,12 @@ public class CollationAttributeFactory extends AttributeSource.AttributeFactory /** * Create a CollationAttributeFactory, using - * {@link org.apache.lucene.util.AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY} as the + * {@link org.apache.lucene.analysis.Token#TOKEN_ATTRIBUTE_FACTORY} as the * factory for all other attributes. * @param collator CollationKey generator */ public CollationAttributeFactory(Collator collator) { - this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator); + this(Token.TOKEN_ATTRIBUTE_FACTORY, collator); } /** diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java index 9e4ead4edfd..25005fea73c 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java @@ -20,6 +20,7 @@ package org.apache.lucene.analysis.icu.segmentation; import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -79,7 +80,7 @@ public final class ICUTokenizer extends Tokenizer { * @param config Tailored BreakIterator configuration */ public ICUTokenizer(ICUTokenizerConfig config) { - this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, config); + this(Token.TOKEN_ATTRIBUTE_FACTORY, config); } /** diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java index a1a6a59f19f..42fc1c6419c 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java @@ -17,6 +17,7 @@ package org.apache.lucene.collation; * limitations under the License. */ +import org.apache.lucene.analysis.Token; import org.apache.lucene.collation.tokenattributes.ICUCollatedTermAttributeImpl; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; @@ -68,12 +69,12 @@ public class ICUCollationAttributeFactory extends AttributeSource.AttributeFacto /** * Create an ICUCollationAttributeFactory, using - * {@link org.apache.lucene.util.AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY} as the + * {@link org.apache.lucene.analysis.Token#TOKEN_ATTRIBUTE_FACTORY} as the * factory for all other attributes. * @param collator CollationKey generator */ public ICUCollationAttributeFactory(Collator collator) { - this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator); + this(Token.TOKEN_ATTRIBUTE_FACTORY, collator); } /** diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java index ede01ccdae0..4fc6c91c06e 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java @@ -195,7 +195,7 @@ public final class JapaneseTokenizer extends Tokenizer { * @param mode tokenization mode. */ public JapaneseTokenizer(UserDictionary userDictionary, boolean discardPunctuation, Mode mode) { - this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, userDictionary, discardPunctuation, mode); + this(org.apache.lucene.analysis.Token.TOKEN_ATTRIBUTE_FACTORY, userDictionary, discardPunctuation, mode); } /** diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java index 44878e159f9..0e068c8219e 100644 --- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java +++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.java @@ -22,6 +22,7 @@ import java.text.BreakIterator; import java.util.Iterator; import java.util.Locale; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.cn.smart.hhmm.SegToken; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -47,7 +48,7 @@ public class HMMChineseTokenizer extends SegmentingTokenizerBase { /** Creates a new HMMChineseTokenizer */ public HMMChineseTokenizer() { - this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); + this(Token.TOKEN_ATTRIBUTE_FACTORY); } /** Creates a new HMMChineseTokenizer, supplying the AttributeFactory */ diff --git a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java index 330ba8338e8..7715fe38c57 100644 --- a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java +++ b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.uima; * limitations under the License. */ +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -43,7 +44,7 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer { private int finalOffset = 0; public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Map configurationParameters) { - this(descriptorPath, tokenType, configurationParameters, AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); + this(descriptorPath, tokenType, configurationParameters, Token.TOKEN_ATTRIBUTE_FACTORY); } public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Map configurationParameters, diff --git a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java index a7bbfb04409..045813d42e3 100644 --- a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java +++ b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.uima; * limitations under the License. */ +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -53,7 +54,7 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer { private int finalOffset = 0; public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Map configurationParameters) { - this(descriptorPath, tokenType, typeAttributeFeaturePath, configurationParameters, AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); + this(descriptorPath, tokenType, typeAttributeFeaturePath, configurationParameters, Token.TOKEN_ATTRIBUTE_FACTORY); } public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java index 0d10d6c1c57..ef137c29a91 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java @@ -89,7 +89,7 @@ public abstract class TokenStream extends AttributeSource implements Closeable { * A TokenStream using the default attribute factory. */ protected TokenStream() { - super(); + super(Token.TOKEN_ATTRIBUTE_FACTORY); assert assertFinal(); } diff --git a/solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java b/solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java index 6d85e9a31ee..d749bc02b4d 100644 --- a/solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java +++ b/solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java @@ -255,6 +255,8 @@ public class PreAnalyzedField extends FieldType { private PreAnalyzedParser parser; public PreAnalyzedTokenizer(PreAnalyzedParser parser) { + // we don't pack attributes: since we are used for (de)serialization and dont want bloat. + super(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); this.parser = parser; }