Merge pull request #3341 from clintongormley/pattern_capture

Added the "pattern_capture" token filter from Lucene 4.4
2013-07-16 09:20:13 -07:00 · 2013-07-16 09:20:13 -07:00 · 1bc8f82d0a
parent 933fd50466 16e137ebbc
commit 1bc8f82d0a
7 changed files with 412 additions and 6 deletions
--- a/src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java
+++ b/src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java
@ -0,0 +1,200 @@
 package org.apache.lucene.analysis.pattern;
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.CharsRef;
 /**
 * CaptureGroup uses Java regexes to emit multiple tokens - one for each capture
 * group in one or more patterns.
 *
 * <p>
 * For example, a pattern like:
 * </p>
 *
 * <p>
 * <code>"(https?://([a-zA-Z\-_0-9.]+))"</code>
 * </p>
 *
 * <p>
 * when matched against the string "http://www.foo.com/index" would return the
 * tokens "http://www.foo.com" and "www.foo.com".
 * </p>
 *
 * <p>
 * If none of the patterns match, or if preserveOriginal is true, the original
 * token will be preserved.
 * </p>
 * <p>
 * Each pattern is matched as often as it can be, so the pattern
 * <code> "(...)"</code>, when matched against <code>"abcdefghi"</code> would
 * produce <code>["abc","def","ghi"]</code>
 * </p>
 * <p>
 * A camelCaseFilter could be written as:
 * </p>
 * <p>
 * <code>
 *   "([A-Z]{2,})",                                 <br />
 *   "(?&lt;![A-Z])([A-Z][a-z]+)",                     <br />
 *   "(?:^|\\b|(?&lt;=[0-9_])|(?&lt;=[A-Z]{2}))([a-z]+)", <br />
 *   "([0-9]+)"
 * </code>
 * </p>
 * <p>
 * plus if {@link #preserveOriginal} is true, it would also return
 * <code>"camelCaseFilter"</code>
 * </p>
 * <p>
 * Every capture emitted after the first token produced for an input token is
 * given a position increment of 0, i.e. it is stacked on the same position.
 * </p>
 */
 public final class XPatternCaptureGroupTokenFilter extends TokenFilter {
  // Term text and position increment of the token currently being emitted.
  private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
  // Attribute state captured from the current input token; restored before each extra capture is emitted.
  private State state;
  // One reusable matcher per pattern, re-targeted at `spare` for every input token.
  private final Matcher[] matchers;
  // Private copy of the current input token's characters; all matcher offsets index into it.
  private final CharsRef spare = new CharsRef();
  // Number of capture groups of each pattern (same index as `matchers`).
  private final int[] groupCounts;
  private final boolean preserveOriginal;
  // Per-matcher cursor: -1 = find() must be called, 0 = matcher has no (more) matches for this token,
  // >= 1 = index of the next capture group to examine within the current match.
  private int[] currentGroup;
  // Index of the matcher selected by the last nextCapture() call, or -1 when no capture is pending.
  private int currentMatcher;
  /**
   * @param input
   *          the input {@link TokenStream}
   * @param preserveOriginal
   *          set to true to return the original token even if one of the
   *          patterns matches
   * @param patterns
   *          an array of {@link Pattern} objects to match against each token
   */
  public XPatternCaptureGroupTokenFilter(TokenStream input,
      boolean preserveOriginal, Pattern... patterns) {
    super(input);
    this.preserveOriginal = preserveOriginal;
    this.matchers = new Matcher[patterns.length];
    this.groupCounts = new int[patterns.length];
    this.currentGroup = new int[patterns.length];
    for (int i = 0; i < patterns.length; i++) {
      // Matchers are created against an empty input; incrementToken() resets them per token.
      this.matchers[i] = patterns[i].matcher("");
      this.groupCounts[i] = this.matchers[i].groupCount();
      this.currentGroup[i] = -1;
    }
  }
  /**
   * Advances every matcher to its next usable capture group and selects the
   * matcher whose pending group starts at the smallest offset (the earlier
   * matcher wins on equal offsets). The choice is stored in
   * {@link #currentMatcher}; the group to emit is
   * <code>currentGroup[currentMatcher]</code>.
   *
   * @return true if some matcher has a capture ready to be emitted
   */
  private boolean nextCapture() {
    int min_offset = Integer.MAX_VALUE;
    currentMatcher = -1;
    Matcher matcher;
    for (int i = 0; i < matchers.length; i++) {
      matcher = matchers[i];
      if (currentGroup[i] == -1) {
        // Need a fresh match: start at group 1 on success, mark the matcher exhausted (0) otherwise.
        currentGroup[i] = matcher.find() ? 1 : 0;
      }
      if (currentGroup[i] != 0) {
        while (currentGroup[i] < groupCounts[i] + 1) {
          final int start = matcher.start(currentGroup[i]);
          final int end = matcher.end(currentGroup[i]);
          // Skip empty groups, and - when the original token is emitted anyway - groups that span the
          // whole token. Note that && binds tighter than ||.
          if (start == end || preserveOriginal && start == 0
              && spare.length == end) {
            currentGroup[i]++;
            continue;
          }
          if (start < min_offset) {
            min_offset = start;
            currentMatcher = i;
          }
          break;
        }
        if (currentGroup[i] == groupCounts[i] + 1) {
          // All groups of this match are used up: request a new find() and revisit the same matcher
          // (i-- cancels the loop's i++).
          currentGroup[i] = -1;
          i--;
        }
      }
    }
    return currentMatcher != -1;
  }
  /**
   * Emits the next pending capture of the current input token if there is one;
   * otherwise pulls the next token from the input and emits either the
   * original token (preserveOriginal, or no pattern produced a capture) or its
   * first capture.
   */
  @Override
  public boolean incrementToken() throws IOException {
    // A token has already been emitted for the current input token: emit further captures, if any.
    if (currentMatcher != -1 && nextCapture()) {
      assert state != null;
      clearAttributes();
      restoreState(state);
      final int start = matchers[currentMatcher]
          .start(currentGroup[currentMatcher]);
      final int end = matchers[currentMatcher]
          .end(currentGroup[currentMatcher]);
      // Stack the capture on the position of the token emitted before it.
      posAttr.setPositionIncrement(0);
      charTermAttr.copyBuffer(spare.chars, start, end - start);
      currentGroup[currentMatcher]++;
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    char[] buffer = charTermAttr.buffer();
    int length = charTermAttr.length();
    // Keep a private copy of the term and of the attribute state before the term may be overwritten below.
    spare.copyChars(buffer, 0, length);
    state = captureState();
    for (int i = 0; i < matchers.length; i++) {
      matchers[i].reset(spare);
      currentGroup[i] = -1;
    }
    if (preserveOriginal) {
      // Emit the original token untouched; a non-negative currentMatcher makes the next call look for captures.
      currentMatcher = 0;
    } else if (nextCapture()) {
      final int start = matchers[currentMatcher]
          .start(currentGroup[currentMatcher]);
      final int end = matchers[currentMatcher]
          .end(currentGroup[currentMatcher]);
      // if we start at 0 we can simply set the length and save the copy
      if (start == 0) {
        charTermAttr.setLength(end);
      } else {
        charTermAttr.copyBuffer(spare.chars, start, end - start);
      }
      currentGroup[currentMatcher]++;
    }
    // If nextCapture() found nothing, currentMatcher is -1 and the original token is emitted unchanged.
    return true;
  }
  /**
   * Resets the wrapped stream and drops the captured state and the pending
   * matcher selection.
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    state = null;
    currentMatcher = -1;
  }
 }
--- a/src/main/java/org/elasticsearch/common/settings/ImmutableSettings.java
+++ b/src/main/java/org/elasticsearch/common/settings/ImmutableSettings.java
@ -352,20 +352,29 @@ public class ImmutableSettings implements Settings {
    // Single-argument overload: after this change it calls the three-argument form directly,
    // with an empty default array and comma-delimited parsing enabled.
    @Override
    public String[] getAsArray(String settingPrefix) throws SettingsException {
-        return getAsArray(settingPrefix, Strings.EMPTY_ARRAY);
+        return getAsArray(settingPrefix, Strings.EMPTY_ARRAY, true);
    }
    // Two-argument overload: delegates to the three-argument form with comma-delimited parsing enabled.
    @Override
    public String[] getAsArray(String settingPrefix, String[] defaultArray) throws SettingsException {
        return getAsArray(settingPrefix, defaultArray, true);
    }
    @Override
    public String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException {
        List<String> result = Lists.newArrayList();
        if (get(settingPrefix) != null) {
            if (commaDelimited) {
                String[] strings = Strings.splitStringByCommaToArray(get(settingPrefix));
                if (strings.length > 0) {
                    for (String string : strings) {
                        result.add(string.trim());
                    }
                }
            } else {
                result.add(get(settingPrefix).trim());
            }
        }
        int counter = 0;
--- a/src/main/java/org/elasticsearch/common/settings/Settings.java
+++ b/src/main/java/org/elasticsearch/common/settings/Settings.java
@ -234,6 +234,21 @@ public interface Settings {
     * the numbered format.
     *
     * @param settingPrefix The setting prefix to load the array by
     * @param defaultArray The default array to use if no value is specified
     * @param commaDelimited Whether to try to parse a string as a comma-delimited value
     * @return The setting array values
     * @throws SettingsException
     */
    String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException;
    /**
     * The values associated with a setting prefix as an array. The settings array is in the format of:
     * <tt>settingPrefix.[index]</tt>.
     * <p/>
     * <p>It will also automatically load a comma separated list under the settingPrefix and merge with
     * the numbered format.
     *
     * @param settingPrefix The setting prefix to load the array by
     * @return The setting array values
     * @throws SettingsException
     */
@ -253,7 +268,7 @@ public interface Settings {
    String[] getAsArray(String settingPrefix) throws SettingsException;
    /**
-     * Retruns a parsed version.
+     * Returns a parsed version.
     */
    Version getAsVersion(String setting, Version defaultVersion) throws SettingsException;
--- a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
+++ b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
@ -485,6 +485,7 @@ public class AnalysisModule extends AbstractModule {
            tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("hyphenation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
--- a/src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java
@ -0,0 +1,63 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.pattern.XPatternCaptureGroupTokenFilter;
 import org.apache.lucene.util.Version;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettings;
 import java.util.regex.Pattern;
/**
 * Token filter factory for the "pattern_capture" filter: compiles the regexes
 * listed under the "patterns" setting and wraps token streams in an
 * {@link XPatternCaptureGroupTokenFilter}.
 */
@AnalysisSettingsRequired
 public class PatternCaptureGroupTokenFilterFactory extends AbstractTokenFilterFactory {
    /** Compiled regexes, one per entry of the "patterns" setting; assigned once in the constructor. */
    private final Pattern[] patterns;
    /** Value of the "preserve_original" setting; true when the setting is absent. */
    private final boolean preserveOriginal;
    static {
        // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1471347.
        assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
    }
    /**
     * Reads the filter configuration.
     *
     * @param index         the index this factory belongs to
     * @param indexSettings the index-level settings
     * @param name          the name the filter is registered under
     * @param settings      the filter's own settings ("patterns", "preserve_original")
     * @throws java.util.regex.PatternSyntaxException if one of the configured patterns is not a valid regex
     */
    @Inject
    public PatternCaptureGroupTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name,
            @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        // commaDelimited is false so that a single string value is taken as one regex
        // instead of being split on commas.
        String[] regexes = settings.getAsArray("patterns", Strings.EMPTY_ARRAY, false);
        patterns = new Pattern[regexes.length];
        for (int i = 0; i < regexes.length; i++) {
            patterns[i] = Pattern.compile(regexes[i]);
        }
        preserveOriginal = settings.getAsBoolean("preserve_original", true);
    }
    /**
     * Wraps the given stream in a new capture-group filter configured with this
     * factory's patterns and preserve-original flag.
     */
    @Override
    public XPatternCaptureGroupTokenFilter create(TokenStream tokenStream) {
        return new XPatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
    }
 }
--- a/src/test/java/org/elasticsearch/test/unit/index/analysis/PatternCaptureTokenFilterTests.java
+++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/PatternCaptureTokenFilterTests.java
@ -0,0 +1,72 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.test.unit.index.analysis;
 import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.inject.ModulesBuilder;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.settings.SettingsModule;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.env.EnvironmentModule;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexNameModule;
 import org.elasticsearch.index.analysis.AnalysisModule;
 import org.elasticsearch.index.analysis.AnalysisService;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.settings.IndexSettingsModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
 import org.testng.annotations.Test;
 import java.io.StringReader;
 import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
 import static org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper.assertSimpleTSOutput;
 public class PatternCaptureTokenFilterTests {
    /**
     * Builds an {@link AnalysisService} from the settings in
     * <code>pattern_capture.json</code> and checks the tokens produced by the
     * "single", "multi" and "preserve" analyzers defined there.
     */
    @Test
    public void testPatternCaptureTokenFilter() throws Exception {
        final Index testIndex = new Index("test");
        final Settings jsonSettings = settingsBuilder()
                .loadFromClasspath("org/elasticsearch/test/unit/index/analysis/pattern_capture.json")
                .build();
        // Node-level injector first; the index-level injector is created as its child.
        final ModulesBuilder nodeModules = new ModulesBuilder();
        nodeModules.add(new SettingsModule(jsonSettings), new EnvironmentModule(new Environment(jsonSettings)), new IndicesAnalysisModule());
        final Injector nodeInjector = nodeModules.createInjector();
        final ModulesBuilder indexModules = new ModulesBuilder();
        indexModules.add(
                new IndexSettingsModule(testIndex, jsonSettings),
                new IndexNameModule(testIndex),
                new AnalysisModule(jsonSettings, nodeInjector.getInstance(IndicesAnalysisService.class)));
        final Injector indexInjector = indexModules.createChildInjector(nodeInjector);
        final AnalysisService service = indexInjector.getInstance(AnalysisService.class);
        assertTokens(service, "single", "foobarbaz", "foobarbaz", "foobar", "foo");
        assertTokens(service, "multi", "abc123def", "abc123def", "abc", "123", "def");
        assertTokens(service, "preserve", "foobarbaz", "foobar", "foo");
    }
    /**
     * Looks up the named analyzer, analyzes <code>text</code> under the field
     * name "test" and asserts that the emitted terms equal <code>expected</code>.
     */
    private static void assertTokens(AnalysisService service, String analyzerName, String text,
            String... expected) throws Exception {
        final NamedAnalyzer analyzer = service.analyzer(analyzerName);
        assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(text)), expected);
    }
 }
--- a/src/test/java/org/elasticsearch/test/unit/index/analysis/pattern_capture.json
+++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/pattern_capture.json
@ -0,0 +1,46 @@
 {
   "index": {
      "number_of_shards": 1,
      "number_of_replicas": 0,
      "analysis": {
         "filter": {
            "single": {
               "type": "pattern_capture",
               "patterns": "((...)...)"
            },
            "multi": {
               "type": "pattern_capture",
               "patterns": [
                  "(\\d+)",
                  "([a-z]+)"
               ]
            },
            "preserve": {
               "type": "pattern_capture",
               "preserve_original": false,
               "patterns": "((...)...)"
            }
         },
         "analyzer": {
            "single": {
               "tokenizer": "keyword",
               "filter": [
                  "single"
               ]
            },
            "multi": {
               "tokenizer": "keyword",
               "filter": [
                  "multi"
               ]
            },
            "preserve": {
               "tokenizer": "keyword",
               "filter": [
                  "preserve"
               ]
            }
         }
      }
   }
 }