Merge pull request #3341 from clintongormley/pattern_capture
Added the "pattern_capture" token filter from Lucene 4.4
This commit is contained in:
commit
1bc8f82d0a
|
@ -0,0 +1,200 @@
|
|||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.io.IOException;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
||||
/**
|
||||
* CaptureGroup uses Java regexes to emit multiple tokens - one for each capture
|
||||
* group in one or more patterns.
|
||||
*
|
||||
* <p>
|
||||
* For example, a pattern like:
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* <code>"(https?://([a-zA-Z\-_0-9.]+))"</code>
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* when matched against the string "http://www.foo.com/index" would return the
|
||||
* tokens "http://www.foo.com" and "www.foo.com".
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* If none of the patterns match, or if preserveOriginal is true, the original
|
||||
* token will be preserved.
|
||||
* </p>
|
||||
* <p>
|
||||
* Each pattern is matched as often as it can be, so the pattern
|
||||
* <code> "(...)"</code>, when matched against <code>"abcdefghi"</code> would
|
||||
* produce <code>["abc","def","ghi"]</code>
|
||||
* </p>
|
||||
* <p>
|
||||
* A camelCaseFilter could be written as:
|
||||
* </p>
|
||||
* <p>
|
||||
* <code>
|
||||
* "([A-Z]{2,})", <br />
|
||||
* "(?<![A-Z])([A-Z][a-z]+)", <br />
|
||||
* "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)", <br />
|
||||
* "([0-9]+)"
|
||||
* </code>
|
||||
* </p>
|
||||
* <p>
|
||||
* plus if {@link #preserveOriginal} is true, it would also return
|
||||
* <code>"camelCaseFilter"</code>
|
||||
* </p>
|
||||
*/
|
||||
public final class XPatternCaptureGroupTokenFilter extends TokenFilter {
|
||||
|
||||
private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
|
||||
private State state;
|
||||
private final Matcher[] matchers;
|
||||
private final CharsRef spare = new CharsRef();
|
||||
private final int[] groupCounts;
|
||||
private final boolean preserveOriginal;
|
||||
private int[] currentGroup;
|
||||
private int currentMatcher;
|
||||
|
||||
/**
|
||||
* @param input
|
||||
* the input {@link TokenStream}
|
||||
* @param preserveOriginal
|
||||
* set to true to return the original token even if one of the
|
||||
* patterns matches
|
||||
* @param patterns
|
||||
* an array of {@link Pattern} objects to match against each token
|
||||
*/
|
||||
|
||||
public XPatternCaptureGroupTokenFilter(TokenStream input,
|
||||
boolean preserveOriginal, Pattern... patterns) {
|
||||
super(input);
|
||||
this.preserveOriginal = preserveOriginal;
|
||||
this.matchers = new Matcher[patterns.length];
|
||||
this.groupCounts = new int[patterns.length];
|
||||
this.currentGroup = new int[patterns.length];
|
||||
for (int i = 0; i < patterns.length; i++) {
|
||||
this.matchers[i] = patterns[i].matcher("");
|
||||
this.groupCounts[i] = this.matchers[i].groupCount();
|
||||
this.currentGroup[i] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean nextCapture() {
|
||||
int min_offset = Integer.MAX_VALUE;
|
||||
currentMatcher = -1;
|
||||
Matcher matcher;
|
||||
|
||||
for (int i = 0; i < matchers.length; i++) {
|
||||
matcher = matchers[i];
|
||||
if (currentGroup[i] == -1) {
|
||||
currentGroup[i] = matcher.find() ? 1 : 0;
|
||||
}
|
||||
if (currentGroup[i] != 0) {
|
||||
while (currentGroup[i] < groupCounts[i] + 1) {
|
||||
final int start = matcher.start(currentGroup[i]);
|
||||
final int end = matcher.end(currentGroup[i]);
|
||||
if (start == end || preserveOriginal && start == 0
|
||||
&& spare.length == end) {
|
||||
currentGroup[i]++;
|
||||
continue;
|
||||
}
|
||||
if (start < min_offset) {
|
||||
min_offset = start;
|
||||
currentMatcher = i;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (currentGroup[i] == groupCounts[i] + 1) {
|
||||
currentGroup[i] = -1;
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
return currentMatcher != -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
|
||||
if (currentMatcher != -1 && nextCapture()) {
|
||||
assert state != null;
|
||||
clearAttributes();
|
||||
restoreState(state);
|
||||
final int start = matchers[currentMatcher]
|
||||
.start(currentGroup[currentMatcher]);
|
||||
final int end = matchers[currentMatcher]
|
||||
.end(currentGroup[currentMatcher]);
|
||||
|
||||
posAttr.setPositionIncrement(0);
|
||||
charTermAttr.copyBuffer(spare.chars, start, end - start);
|
||||
currentGroup[currentMatcher]++;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
char[] buffer = charTermAttr.buffer();
|
||||
int length = charTermAttr.length();
|
||||
spare.copyChars(buffer, 0, length);
|
||||
state = captureState();
|
||||
|
||||
for (int i = 0; i < matchers.length; i++) {
|
||||
matchers[i].reset(spare);
|
||||
currentGroup[i] = -1;
|
||||
}
|
||||
|
||||
if (preserveOriginal) {
|
||||
currentMatcher = 0;
|
||||
} else if (nextCapture()) {
|
||||
final int start = matchers[currentMatcher]
|
||||
.start(currentGroup[currentMatcher]);
|
||||
final int end = matchers[currentMatcher]
|
||||
.end(currentGroup[currentMatcher]);
|
||||
|
||||
// if we start at 0 we can simply set the length and save the copy
|
||||
if (start == 0) {
|
||||
charTermAttr.setLength(end);
|
||||
} else {
|
||||
charTermAttr.copyBuffer(spare.chars, start, end - start);
|
||||
}
|
||||
currentGroup[currentMatcher]++;
|
||||
}
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
state = null;
|
||||
currentMatcher = -1;
|
||||
}
|
||||
|
||||
}
|
|
@ -352,20 +352,29 @@ public class ImmutableSettings implements Settings {
|
|||
|
||||
@Override
|
||||
public String[] getAsArray(String settingPrefix) throws SettingsException {
|
||||
return getAsArray(settingPrefix, Strings.EMPTY_ARRAY);
|
||||
return getAsArray(settingPrefix, Strings.EMPTY_ARRAY, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String[] getAsArray(String settingPrefix, String[] defaultArray) throws SettingsException {
|
||||
return getAsArray(settingPrefix, defaultArray, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException {
|
||||
List<String> result = Lists.newArrayList();
|
||||
|
||||
if (get(settingPrefix) != null) {
|
||||
if (commaDelimited) {
|
||||
String[] strings = Strings.splitStringByCommaToArray(get(settingPrefix));
|
||||
if (strings.length > 0) {
|
||||
for (String string : strings) {
|
||||
result.add(string.trim());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result.add(get(settingPrefix).trim());
|
||||
}
|
||||
}
|
||||
|
||||
int counter = 0;
|
||||
|
|
|
@ -234,6 +234,21 @@ public interface Settings {
|
|||
* the numbered format.
|
||||
*
|
||||
* @param settingPrefix The setting prefix to load the array by
|
||||
* @param defaultArray The default array to use if no value is specified
|
||||
* @param commaDelimited Whether to try to parse a string as a comma-delimited value
|
||||
* @return The setting array values
|
||||
* @throws SettingsException
|
||||
*/
|
||||
String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException;
|
||||
|
||||
/**
|
||||
* The values associated with a setting prefix as an array. The settings array is in the format of:
|
||||
* <tt>settingPrefix.[index]</tt>.
|
||||
* <p/>
|
||||
* <p>If commaDelimited is true, it will automatically load a comma separated list under the settingPrefix and merge with
|
||||
* the numbered format.
|
||||
*
|
||||
* @param settingPrefix The setting prefix to load the array by
|
||||
* @return The setting array values
|
||||
* @throws SettingsException
|
||||
*/
|
||||
|
@ -253,7 +268,7 @@ public interface Settings {
|
|||
String[] getAsArray(String settingPrefix) throws SettingsException;
|
||||
|
||||
/**
|
||||
* Retruns a parsed version.
|
||||
* Returns a parsed version.
|
||||
*/
|
||||
Version getAsVersion(String setting, Version defaultVersion) throws SettingsException;
|
||||
|
||||
|
|
|
@ -485,6 +485,7 @@ public class AnalysisModule extends AbstractModule {
|
|||
tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
|
||||
tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);
|
||||
|
||||
tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class);
|
||||
tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class);
|
||||
tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
|
||||
tokenFiltersBindings.processTokenFilter("hyphenation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.pattern.XPatternCaptureGroupTokenFilter;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@AnalysisSettingsRequired
|
||||
public class PatternCaptureGroupTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
private Pattern[] patterns;
|
||||
private boolean preserveOriginal;
|
||||
|
||||
static {
|
||||
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1471347.
|
||||
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
|
||||
}
|
||||
|
||||
|
||||
@Inject
|
||||
public PatternCaptureGroupTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name,
|
||||
@Assisted Settings settings) {
|
||||
super(index, indexSettings, name, settings);
|
||||
String[] regexes = settings.getAsArray("patterns",Strings.EMPTY_ARRAY,false);
|
||||
patterns = new Pattern[regexes.length];
|
||||
for (int i = 0; i < regexes.length; i++) {
|
||||
patterns[i] = Pattern.compile(regexes[i]);
|
||||
}
|
||||
|
||||
preserveOriginal = settings.getAsBoolean("preserve_original", true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public XPatternCaptureGroupTokenFilter create(TokenStream tokenStream) {
|
||||
return new XPatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.test.unit.index.analysis;
|
||||
|
||||
import org.elasticsearch.common.inject.Injector;
|
||||
import org.elasticsearch.common.inject.ModulesBuilder;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.settings.SettingsModule;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.EnvironmentModule;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.IndexNameModule;
|
||||
import org.elasticsearch.index.analysis.AnalysisModule;
|
||||
import org.elasticsearch.index.analysis.AnalysisService;
|
||||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||
import org.elasticsearch.index.settings.IndexSettingsModule;
|
||||
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
|
||||
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
||||
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
|
||||
import static org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper.assertSimpleTSOutput;
|
||||
|
||||
public class PatternCaptureTokenFilterTests {
|
||||
|
||||
@Test
|
||||
public void testPatternCaptureTokenFilter() throws Exception {
|
||||
Index index = new Index("test");
|
||||
Settings settings = settingsBuilder().loadFromClasspath("org/elasticsearch/test/unit/index/analysis/pattern_capture.json").build();
|
||||
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
|
||||
Injector injector = new ModulesBuilder().add(
|
||||
new IndexSettingsModule(index, settings),
|
||||
new IndexNameModule(index),
|
||||
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)))
|
||||
.createChildInjector(parentInjector);
|
||||
|
||||
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
|
||||
|
||||
NamedAnalyzer analyzer1 = analysisService.analyzer("single");
|
||||
|
||||
assertSimpleTSOutput(analyzer1.tokenStream("test", new StringReader("foobarbaz")), new String[]{"foobarbaz","foobar","foo"});
|
||||
|
||||
NamedAnalyzer analyzer2 = analysisService.analyzer("multi");
|
||||
|
||||
assertSimpleTSOutput(analyzer2.tokenStream("test", new StringReader("abc123def")), new String[]{"abc123def","abc","123","def"});
|
||||
|
||||
NamedAnalyzer analyzer3 = analysisService.analyzer("preserve");
|
||||
|
||||
assertSimpleTSOutput(analyzer3.tokenStream("test", new StringReader("foobarbaz")), new String[]{"foobar","foo"});
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
{
|
||||
"index": {
|
||||
"number_of_shards": 1,
|
||||
"number_of_replicas": 0,
|
||||
"analysis": {
|
||||
"filter": {
|
||||
"single": {
|
||||
"type": "pattern_capture",
|
||||
"patterns": "((...)...)"
|
||||
},
|
||||
"multi": {
|
||||
"type": "pattern_capture",
|
||||
"patterns": [
|
||||
"(\\d+)",
|
||||
"([a-z]+)"
|
||||
]
|
||||
},
|
||||
"preserve": {
|
||||
"type": "pattern_capture",
|
||||
"preserve_original": false,
|
||||
"patterns": "((...)...)"
|
||||
}
|
||||
},
|
||||
"analyzer": {
|
||||
"single": {
|
||||
"tokenizer": "keyword",
|
||||
"filter": [
|
||||
"single"
|
||||
]
|
||||
},
|
||||
"multi": {
|
||||
"tokenizer": "keyword",
|
||||
"filter": [
|
||||
"multi"
|
||||
]
|
||||
},
|
||||
"preserve": {
|
||||
"tokenizer": "keyword",
|
||||
"filter": [
|
||||
"preserve"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue