From 16e137ebbc6256d8461297030504b893c9a98370 Mon Sep 17 00:00:00 2001 From: Clinton Gormley Date: Tue, 16 Jul 2013 18:08:12 +0200 Subject: [PATCH] Added the "pattern_capture" token filter from Lucene 4.4 The XPatternCaptureGroupTokenFilter.java file can be removed once we upgrade to Lucene 4.4. This change required the addition of the commaDelimited flag to getAsArray() to disable parsing strings as comma-delimited values. Closes #3340 --- .../XPatternCaptureGroupTokenFilter.java | 200 ++++++++++++++++++ .../common/settings/ImmutableSettings.java | 19 +- .../common/settings/Settings.java | 17 +- .../index/analysis/AnalysisModule.java | 1 + ...PatternCaptureGroupTokenFilterFactory.java | 63 ++++++ .../PatternCaptureTokenFilterTests.java | 72 +++++++ .../unit/index/analysis/pattern_capture.json | 46 ++++ 7 files changed, 412 insertions(+), 6 deletions(-) create mode 100644 src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java create mode 100644 src/test/java/org/elasticsearch/test/unit/index/analysis/PatternCaptureTokenFilterTests.java create mode 100644 src/test/java/org/elasticsearch/test/unit/index/analysis/pattern_capture.json diff --git a/src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java b/src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java new file mode 100644 index 00000000000..6d6dea8d032 --- /dev/null +++ b/src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java @@ -0,0 +1,200 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.CharsRef; + +/** + * CaptureGroup uses Java regexes to emit multiple tokens - one for each capture + * group in one or more patterns. + * + *

+ * For example, a pattern like: + *

+ * + *

+ * "(https?://([a-zA-Z\-_0-9.]+))" + *

+ * + *

+ * when matched against the string "http://www.foo.com/index" would return the + * tokens "http://www.foo.com" and "www.foo.com". + *

+ * + *

+ * If none of the patterns match, or if preserveOriginal is true, the original + * token will be preserved. + *

+ *

+ * Each pattern is matched as often as it can be, so the pattern + * "(...)", when matched against "abcdefghi", would + * produce ["abc","def","ghi"]. + *

+ *

+ * A camelCaseFilter could be written as: + *

+ *

+ * + * "([A-Z]{2,})",
+ * "(?<![A-Z])([A-Z][a-z]+)",
+ * "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)",
+ * "([0-9]+)" + *
+ *

+ *

+ * When matched against "camelCaseFilter", these patterns would return "camel", "Case" and + * "Filter"; if {@link #preserveOriginal} is true, it would also return "camelCaseFilter". + *

+ */ +public final class XPatternCaptureGroupTokenFilter extends TokenFilter { + + private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class); + private State state; + private final Matcher[] matchers; + private final CharsRef spare = new CharsRef(); + private final int[] groupCounts; + private final boolean preserveOriginal; + private int[] currentGroup; + private int currentMatcher; + + /** + * @param input + * the input {@link TokenStream} + * @param preserveOriginal + * set to true to return the original token even if one of the + * patterns matches + * @param patterns + * an array of {@link Pattern} objects to match against each token + */ + + public XPatternCaptureGroupTokenFilter(TokenStream input, + boolean preserveOriginal, Pattern... patterns) { + super(input); + this.preserveOriginal = preserveOriginal; + this.matchers = new Matcher[patterns.length]; + this.groupCounts = new int[patterns.length]; + this.currentGroup = new int[patterns.length]; + for (int i = 0; i < patterns.length; i++) { + this.matchers[i] = patterns[i].matcher(""); + this.groupCounts[i] = this.matchers[i].groupCount(); + this.currentGroup[i] = -1; + } + } + + private boolean nextCapture() { + int min_offset = Integer.MAX_VALUE; + currentMatcher = -1; + Matcher matcher; + + for (int i = 0; i < matchers.length; i++) { + matcher = matchers[i]; + if (currentGroup[i] == -1) { + currentGroup[i] = matcher.find() ? 
1 : 0; + } + if (currentGroup[i] != 0) { + while (currentGroup[i] < groupCounts[i] + 1) { + final int start = matcher.start(currentGroup[i]); + final int end = matcher.end(currentGroup[i]); + if (start == end || preserveOriginal && start == 0 + && spare.length == end) { + currentGroup[i]++; + continue; + } + if (start < min_offset) { + min_offset = start; + currentMatcher = i; + } + break; + } + if (currentGroup[i] == groupCounts[i] + 1) { + currentGroup[i] = -1; + i--; + } + } + } + return currentMatcher != -1; + } + + @Override + public boolean incrementToken() throws IOException { + + if (currentMatcher != -1 && nextCapture()) { + assert state != null; + clearAttributes(); + restoreState(state); + final int start = matchers[currentMatcher] + .start(currentGroup[currentMatcher]); + final int end = matchers[currentMatcher] + .end(currentGroup[currentMatcher]); + + posAttr.setPositionIncrement(0); + charTermAttr.copyBuffer(spare.chars, start, end - start); + currentGroup[currentMatcher]++; + return true; + } + + if (!input.incrementToken()) { + return false; + } + + char[] buffer = charTermAttr.buffer(); + int length = charTermAttr.length(); + spare.copyChars(buffer, 0, length); + state = captureState(); + + for (int i = 0; i < matchers.length; i++) { + matchers[i].reset(spare); + currentGroup[i] = -1; + } + + if (preserveOriginal) { + currentMatcher = 0; + } else if (nextCapture()) { + final int start = matchers[currentMatcher] + .start(currentGroup[currentMatcher]); + final int end = matchers[currentMatcher] + .end(currentGroup[currentMatcher]); + + // if we start at 0 we can simply set the length and save the copy + if (start == 0) { + charTermAttr.setLength(end); + } else { + charTermAttr.copyBuffer(spare.chars, start, end - start); + } + currentGroup[currentMatcher]++; + } + return true; + + } + + @Override + public void reset() throws IOException { + super.reset(); + state = null; + currentMatcher = -1; + } + +} diff --git 
a/src/main/java/org/elasticsearch/common/settings/ImmutableSettings.java b/src/main/java/org/elasticsearch/common/settings/ImmutableSettings.java index 4b7ef310586..67769a6697b 100644 --- a/src/main/java/org/elasticsearch/common/settings/ImmutableSettings.java +++ b/src/main/java/org/elasticsearch/common/settings/ImmutableSettings.java @@ -352,19 +352,28 @@ public class ImmutableSettings implements Settings { @Override public String[] getAsArray(String settingPrefix) throws SettingsException { - return getAsArray(settingPrefix, Strings.EMPTY_ARRAY); + return getAsArray(settingPrefix, Strings.EMPTY_ARRAY, true); } @Override public String[] getAsArray(String settingPrefix, String[] defaultArray) throws SettingsException { + return getAsArray(settingPrefix, defaultArray, true); + } + + @Override + public String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException { List result = Lists.newArrayList(); if (get(settingPrefix) != null) { - String[] strings = Strings.splitStringByCommaToArray(get(settingPrefix)); - if (strings.length > 0) { - for (String string : strings) { - result.add(string.trim()); + if (commaDelimited) { + String[] strings = Strings.splitStringByCommaToArray(get(settingPrefix)); + if (strings.length > 0) { + for (String string : strings) { + result.add(string.trim()); + } } + } else { + result.add(get(settingPrefix).trim()); } } diff --git a/src/main/java/org/elasticsearch/common/settings/Settings.java b/src/main/java/org/elasticsearch/common/settings/Settings.java index 778ce1402b1..aeb9845b647 100644 --- a/src/main/java/org/elasticsearch/common/settings/Settings.java +++ b/src/main/java/org/elasticsearch/common/settings/Settings.java @@ -234,6 +234,21 @@ public interface Settings { * the numbered format. 
* * @param settingPrefix The setting prefix to load the array by + * @param defaultArray The default array to use if no value is specified + * @param commaDelimited Whether to try to parse a string as a comma-delimited value + * @return The setting array values + * @throws SettingsException + */ + String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException; + + /** + * The values associated with a setting prefix as an array. The settings array is in the format of: + * settingPrefix.[index]. + *

+ *

It will also automatically load a comma separated list under the settingPrefix and merge with + * the numbered format. + * + * @param settingPrefix The setting prefix to load the array by * @return The setting array values * @throws SettingsException */ @@ -253,7 +268,7 @@ public interface Settings { String[] getAsArray(String settingPrefix) throws SettingsException; /** - * Retruns a parsed version. + * Returns a parsed version. */ Version getAsVersion(String setting, Version defaultVersion) throws SettingsException; diff --git a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index dee7200baf2..73a47f5f19f 100644 --- a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -485,6 +485,7 @@ public class AnalysisModule extends AbstractModule { tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class); + tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("hyphenation_decompounder", HyphenationCompoundWordTokenFilterFactory.class); diff --git a/src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java new file mode 100644 index 00000000000..86b1d2938ad --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java @@ -0,0 +1,63 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or
more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.pattern.XPatternCaptureGroupTokenFilter; +import org.apache.lucene.util.Version; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.util.regex.Pattern; + +@AnalysisSettingsRequired +public class PatternCaptureGroupTokenFilterFactory extends AbstractTokenFilterFactory { + private Pattern[] patterns; + private boolean preserveOriginal; + + static { + // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1471347. 
+ assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed"; + } + + + @Inject + public PatternCaptureGroupTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, + @Assisted Settings settings) { + super(index, indexSettings, name, settings); + String[] regexes = settings.getAsArray("patterns",Strings.EMPTY_ARRAY,false); + patterns = new Pattern[regexes.length]; + for (int i = 0; i < regexes.length; i++) { + patterns[i] = Pattern.compile(regexes[i]); + } + + preserveOriginal = settings.getAsBoolean("preserve_original", true); + } + + @Override + public XPatternCaptureGroupTokenFilter create(TokenStream tokenStream) { + return new XPatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns); + } +} diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/PatternCaptureTokenFilterTests.java b/src/test/java/org/elasticsearch/test/unit/index/analysis/PatternCaptureTokenFilterTests.java new file mode 100644 index 00000000000..f04176851d4 --- /dev/null +++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/PatternCaptureTokenFilterTests.java @@ -0,0 +1,72 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.test.unit.index.analysis; + +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.analysis.AnalysisModule; +import org.elasticsearch.index.analysis.AnalysisService; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.testng.annotations.Test; + +import java.io.StringReader; + +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper.assertSimpleTSOutput; + +public class PatternCaptureTokenFilterTests { + + @Test + public void testPatternCaptureTokenFilter() throws Exception { + Index index = new Index("test"); + Settings settings = settingsBuilder().loadFromClasspath("org/elasticsearch/test/unit/index/analysis/pattern_capture.json").build(); + Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, settings), + new IndexNameModule(index), + new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class))) + .createChildInjector(parentInjector); + + AnalysisService analysisService = injector.getInstance(AnalysisService.class); + + 
NamedAnalyzer analyzer1 = analysisService.analyzer("single"); + + assertSimpleTSOutput(analyzer1.tokenStream("test", new StringReader("foobarbaz")), new String[]{"foobarbaz","foobar","foo"}); + + NamedAnalyzer analyzer2 = analysisService.analyzer("multi"); + + assertSimpleTSOutput(analyzer2.tokenStream("test", new StringReader("abc123def")), new String[]{"abc123def","abc","123","def"}); + + NamedAnalyzer analyzer3 = analysisService.analyzer("preserve"); + + assertSimpleTSOutput(analyzer3.tokenStream("test", new StringReader("foobarbaz")), new String[]{"foobar","foo"}); + + } + +} diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/pattern_capture.json b/src/test/java/org/elasticsearch/test/unit/index/analysis/pattern_capture.json new file mode 100644 index 00000000000..d82fb987e6e --- /dev/null +++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/pattern_capture.json @@ -0,0 +1,46 @@ +{ + "index": { + "number_of_shards": 1, + "number_of_replicas": 0, + "analysis": { + "filter": { + "single": { + "type": "pattern_capture", + "patterns": "((...)...)" + }, + "multi": { + "type": "pattern_capture", + "patterns": [ + "(\\d+)", + "([a-z]+)" + ] + }, + "preserve": { + "type": "pattern_capture", + "preserve_original": false, + "patterns": "((...)...)" + } + }, + "analyzer": { + "single": { + "tokenizer": "keyword", + "filter": [ + "single" + ] + }, + "multi": { + "tokenizer": "keyword", + "filter": [ + "multi" + ] + }, + "preserve": { + "tokenizer": "keyword", + "filter": [ + "preserve" + ] + } + } + } + } +} \ No newline at end of file