From 69a7f8d71dd614bf44baf9d99f2b82306cdd4e34 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 16 Jul 2013 22:14:33 +0200 Subject: [PATCH] Removed XPatternCaptureGroupTokenFilter --- .../XPatternCaptureGroupTokenFilter.java | 200 ------------------ ...PatternCaptureGroupTokenFilterFactory.java | 15 +- 2 files changed, 6 insertions(+), 209 deletions(-) delete mode 100644 src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java diff --git a/src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java b/src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java deleted file mode 100644 index 6d6dea8d032..00000000000 --- a/src/main/java/org/apache/lucene/analysis/pattern/XPatternCaptureGroupTokenFilter.java +++ /dev/null @@ -1,200 +0,0 @@ -package org.apache.lucene.analysis.pattern; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -import java.io.IOException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.CharsRef; - -/** - * CaptureGroup uses Java regexes to emit multiple tokens - one for each capture - * group in one or more patterns. - * - *

- * For example, a pattern like: - *

- * - *

- * "(https?://([a-zA-Z\-_0-9.]+))" - *

- * - *

- * when matched against the string "http://www.foo.com/index" would return the - * tokens "https://www.foo.com" and "www.foo.com". - *

- * - *

- * If none of the patterns match, or if preserveOriginal is true, the original - * token will be preserved. - *

- *

- * Each pattern is matched as often as it can be, so the pattern - * "(...)", when matched against "abcdefghi" would - * produce ["abc","def","ghi"] - *

- *

- * A camelCaseFilter could be written as: - *

- *

- * - * "([A-Z]{2,})",
- * "(?<![A-Z])([A-Z][a-z]+)",
- * "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)",
- * "([0-9]+)" - *
- *

- *

- * plus if {@link #preserveOriginal} is true, it would also return - * "camelCaseFilter - *

- */ -public final class XPatternCaptureGroupTokenFilter extends TokenFilter { - - private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class); - private State state; - private final Matcher[] matchers; - private final CharsRef spare = new CharsRef(); - private final int[] groupCounts; - private final boolean preserveOriginal; - private int[] currentGroup; - private int currentMatcher; - - /** - * @param input - * the input {@link TokenStream} - * @param preserveOriginal - * set to true to return the original token even if one of the - * patterns matches - * @param patterns - * an array of {@link Pattern} objects to match against each token - */ - - public XPatternCaptureGroupTokenFilter(TokenStream input, - boolean preserveOriginal, Pattern... patterns) { - super(input); - this.preserveOriginal = preserveOriginal; - this.matchers = new Matcher[patterns.length]; - this.groupCounts = new int[patterns.length]; - this.currentGroup = new int[patterns.length]; - for (int i = 0; i < patterns.length; i++) { - this.matchers[i] = patterns[i].matcher(""); - this.groupCounts[i] = this.matchers[i].groupCount(); - this.currentGroup[i] = -1; - } - } - - private boolean nextCapture() { - int min_offset = Integer.MAX_VALUE; - currentMatcher = -1; - Matcher matcher; - - for (int i = 0; i < matchers.length; i++) { - matcher = matchers[i]; - if (currentGroup[i] == -1) { - currentGroup[i] = matcher.find() ? 1 : 0; - } - if (currentGroup[i] != 0) { - while (currentGroup[i] < groupCounts[i] + 1) { - final int start = matcher.start(currentGroup[i]); - final int end = matcher.end(currentGroup[i]); - if (start == end || preserveOriginal && start == 0 - && spare.length == end) { - currentGroup[i]++; - continue; - } - if (start < min_offset) { - min_offset = start; - currentMatcher = i; - } - break; - } - if (currentGroup[i] == groupCounts[i] + 1) { - currentGroup[i] = -1; - i--; - } - } - } - return currentMatcher != -1; - } - - @Override - public boolean incrementToken() throws IOException { - - if (currentMatcher != -1 && nextCapture()) { - assert state != null; - clearAttributes(); - restoreState(state); - final int start = matchers[currentMatcher] - .start(currentGroup[currentMatcher]); - final int end = matchers[currentMatcher] - .end(currentGroup[currentMatcher]); - - posAttr.setPositionIncrement(0); - charTermAttr.copyBuffer(spare.chars, start, end - start); - currentGroup[currentMatcher]++; - return true; - } - - if (!input.incrementToken()) { - return false; - } - - char[] buffer = charTermAttr.buffer(); - int length = charTermAttr.length(); - spare.copyChars(buffer, 0, length); - state = captureState(); - - for (int i = 0; i < matchers.length; i++) { - matchers[i].reset(spare); - currentGroup[i] = -1; - } - - if (preserveOriginal) { - currentMatcher = 0; - } else if (nextCapture()) { - final int start = matchers[currentMatcher] - .start(currentGroup[currentMatcher]); - final int end = matchers[currentMatcher] - .end(currentGroup[currentMatcher]); - - // if we start at 0 we can simply set the length and save the copy - if (start == 0) { - charTermAttr.setLength(end); - } else { - charTermAttr.copyBuffer(spare.chars, start, end - start); - } - currentGroup[currentMatcher]++; - } - return true; - - } - - @Override - public void reset() throws IOException { - super.reset(); - state = null; - currentMatcher = -1; - } - -} diff --git a/src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java index 86b1d2938ad..6d84e6a8edc 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/PatternCaptureGroupTokenFilterFactory.java @@ -19,13 +19,14 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.pattern.PatternCaptureGroupTokenFilter; + +import org.apache.lucene.analysis.TokenFilter; + import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.pattern.XPatternCaptureGroupTokenFilter; -import org.apache.lucene.util.Version; import org.elasticsearch.common.Strings; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; -import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; @@ -37,10 +38,6 @@ public class PatternCaptureGroupTokenFilterFactory extends AbstractTokenFilterFa private Pattern[] patterns; private boolean preserveOriginal; - static { - // LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1471347. - assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed"; - } @Inject @@ -57,7 +54,7 @@ public class PatternCaptureGroupTokenFilterFactory extends AbstractTokenFilterFa } @Override - public XPatternCaptureGroupTokenFilter create(TokenStream tokenStream) { - return new XPatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns); + public TokenFilter create(TokenStream tokenStream) { + return new PatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns); } }