diff --git a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
new file mode 100644
index 00000000000..7a6397a5170
--- /dev/null
+++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.analysis.pattern;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * This tokenizer uses regex pattern matching to construct distinct tokens
+ * for the input stream. It takes two arguments: "pattern" and "group".
+ * <p>
+ * group=-1 (the default) is equivalent to "split". In this case, the tokens will
+ * be equivalent to the output from (without empty tokens):
+ * {@link String#split(java.lang.String)}
+ * </p>
+ * <p>
+ * Using group &gt;= 0 selects the matching group as the token. For example, if you have:
+ * <pre>
+ *  pattern = \'([^\']+)\'
+ *  group = 0
+ *  input = aaa 'bbb' 'ccc'
+ * </pre>
+ * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
+ * but using group=1, the output would be: bbb and ccc (no ' marks)
+ * </p>
+ * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
+ *
+ * @see Pattern
+ */
+public final class PatternTokenizer extends Tokenizer {
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+    private final StringBuilder str = new StringBuilder(); // entire input, buffered in RAM (see TODO near fillBuffer)
+    private int index; // scan position into str; Integer.MAX_VALUE once exhausted
+
+    private final Pattern pattern;
+    private final int group; // -1 = split mode, >= 0 = emit that capture group
+    private final Matcher matcher;
+
+    /**
+     * creates a new PatternTokenizer returning tokens from group (-1 for split functionality)
+     */
+    public PatternTokenizer(Reader input, Pattern pattern, int group) throws IOException {
+        super(input);
+        this.pattern = pattern;
+        this.group = group;
+        fillBuffer(str, input); // reads the whole Reader up front; IOException propagates to the caller
+        matcher = pattern.matcher(str);
+        index = 0;
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+        if (index >= str.length()) return false;
+        clearAttributes();
+        if (group >= 0) {
+
+            // match a specific group
+            while (matcher.find()) {
+                index = matcher.start(group);
+                final int endIndex = matcher.end(group);
+                if (index == endIndex) continue; // never emit zero-length tokens
+                termAtt.setEmpty().append(str, index, endIndex);
+                offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex));
+                return true;
+            }
+
+            index = Integer.MAX_VALUE; // mark exhausted
+            return false;
+
+        } else {
+
+            // String.split() functionality
+            while (matcher.find()) {
+                if (matcher.start() - index > 0) {
+                    // found a non-zero-length token
+                    termAtt.setEmpty().append(str, index, matcher.start());
+                    offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
+                    index = matcher.end();
+                    return true;
+                }
+
+                index = matcher.end(); // adjacent delimiters: skip the empty token
+            }
+
+            if (str.length() - index == 0) {
+                index = Integer.MAX_VALUE; // mark exhausted
+                return false;
+            }
+
+            termAtt.setEmpty().append(str, index, str.length()); // trailing text after the final delimiter
+            offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
+            index = Integer.MAX_VALUE; // mark exhausted
+            return true;
+        }
+    }
+
+    @Override
+    public void end() throws IOException
{
+        // set the final offset past the last character, mapped through correctOffset for CharFilters
+        final int ofs = correctOffset(str.length());
+        offsetAtt.setOffset(ofs, ofs);
+    }
+
+    @Override
+    public void reset(Reader input) throws IOException {
+        super.reset(input);
+        fillBuffer(str, input); // re-read the new Reader into the shared buffer
+        matcher.reset(str);
+        index = 0; // restart scanning from the top
+    }
+
+    // TODO: we should see if we can make this tokenizer work without reading
+    // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
+    final char[] buffer = new char[8192]; // reused chunk buffer for fillBuffer
+
+    private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
+        int len;
+        sb.setLength(0); // discard any previous document's contents
+        while ((len = input.read(buffer)) > 0) {
+            sb.append(buffer, 0, len);
+        }
+    }
+}
diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
index 8a05c0ea3f5..19e84ec7e1e 100644
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
@@ -268,6 +268,12 @@ public class AnalysisModule extends AbstractModule {
             if (type == null) {
                 throw new ElasticSearchIllegalArgumentException("Tokenizer [" + tokenizerName + "] must have a type associated with it");
             }
+
+            // if it requires settings, and it has none, then don't register it
+            if (tokenizerSettings.getAsMap().isEmpty() && type.getAnnotation(AnalysisSettingsRequired.class) != null) {
+                continue;
+            }
+
             tokenizerBinder.addBinding(tokenizerName).toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, type)).in(Scopes.SINGLETON);
         }
@@ -375,6 +381,7 @@ public class AnalysisModule extends AbstractModule {
         }

         @Override public void processTokenizers(TokenizersBindings tokenizersBindings) {
+            tokenizersBindings.processTokenizer("pattern", PatternTokenizerFactory.class);
         }

         @Override public void processAnalyzers(AnalyzersBindings analyzersBindings) {
diff --git
a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PatternTokenizerFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PatternTokenizerFactory.java
new file mode 100644
index 00000000000..bbe24dc22b8
--- /dev/null
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PatternTokenizerFactory.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.pattern.PatternTokenizer;
+import org.elasticsearch.ElasticSearchIllegalStateException;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.regex.Regex;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.regex.Pattern;
+
+/**
+ * A tokenizer factory for the "pattern" tokenizer, backed by {@link PatternTokenizer}.
+ * Supported settings: "pattern" (regex, defaults to \W+ -- Lucene's
+ * PatternAnalyzer.NON_WORD_PATTERN), "flags" (regex flags string), and
+ * "group" (capture group to emit; -1, the default, means split mode).
+ */
+public class PatternTokenizerFactory extends AbstractTokenizerFactory {
+
+    private final Pattern pattern;
+    private final int group;
+
+    @Inject public PatternTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+
+        // the non-null default makes a missing "pattern" setting fall back to \W+,
+        // so no null check is needed (the previous one was unreachable dead code)
+        String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
+
+        this.pattern = Regex.compile(sPattern, settings.get("flags"));
+        this.group = settings.getAsInt("group", -1);
+    }
+
+    @Override public Tokenizer create(Reader reader) {
+        try {
+            return new PatternTokenizer(reader, pattern, group);
+        } catch (IOException e) {
+            // PatternTokenizer eagerly drains the Reader in its constructor; keep the cause
+            throw new ElasticSearchIllegalStateException("failed to create pattern tokenizer", e);
+        }
+    }
+}
diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java b/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java
index 63a4fdd421d..50fd687fbb2 100644
---
a/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java
@@ -55,6 +55,7 @@ import org.apache.lucene.analysis.nl.DutchAnalyzer;
 import org.apache.lucene.analysis.nl.DutchStemFilter;
 import org.apache.lucene.analysis.no.NorwegianAnalyzer;
 import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
+import org.apache.lucene.analysis.pattern.PatternTokenizer;
 import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 import org.apache.lucene.analysis.ro.RomanianAnalyzer;
@@ -70,6 +71,7 @@ import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.sv.SwedishAnalyzer;
 import org.apache.lucene.analysis.th.ThaiAnalyzer;
 import org.apache.lucene.analysis.tr.TurkishAnalyzer;
+import org.elasticsearch.ElasticSearchIllegalStateException;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.lucene.Lucene;
@@ -79,6 +81,7 @@ import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.index.analysis.*;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.util.Map;
 
@@ -259,6 +262,22 @@ public class IndicesAnalysisService extends AbstractComponent {
             }
         }));
 
+        // pre-built "pattern" tokenizer: default \W+ pattern in split mode (group = -1)
+        // NOTE(review): relies on org.elasticsearch.common.regex.Regex already being imported in this file -- confirm
+        tokenizerFactories.put("pattern", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "pattern";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                try {
+                    return new PatternTokenizer(reader, Regex.compile("\\W+", null), -1);
+                } catch (IOException e) {
+                    // the IOException comes from draining the Reader, not from parsing the pattern; keep the cause
+                    throw new ElasticSearchIllegalStateException("failed to create pattern tokenizer", e);
+                }
+            }
+        }));
+
         // Token Filters
         tokenFilterFactories.put("stop", new
PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() {