From 6fad75df824a087fb304c766ee8dc61ee178f594 Mon Sep 17 00:00:00 2001 From: Igor Motov Date: Fri, 26 Oct 2012 00:03:16 -0400 Subject: [PATCH] lucene 4: remove Pattern tokenizer and filter --- .../pattern/PatternReplaceFilter.java | 85 ---------- .../analysis/pattern/PatternTokenizer.java | 153 ------------------ 2 files changed, 238 deletions(-) delete mode 100644 src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java delete mode 100644 src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java diff --git a/src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java b/src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java deleted file mode 100644 index ef7565215b1..00000000000 --- a/src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.lucene.analysis.pattern; - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; - -import java.io.IOException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * A TokenFilter which applies a Pattern to each token in the stream, - * replacing match occurances with the specified replacement string. - *

- *

- * Note: Depending on the input and the pattern used and the input - * TokenStream, this TokenFilter may produce Tokens whose text is the empty - * string. - *

- * - * @see Pattern - */ -public final class PatternReplaceFilter extends TokenFilter { - private final Pattern p; - private final String replacement; - private final boolean all; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final Matcher m; - - /** - * Constructs an instance to replace either the first, or all occurances - * - * @param in the TokenStream to process - * @param p the patterm to apply to each Token - * @param replacement the "replacement string" to substitute, if null a - * blank string will be used. Note that this is not the literal - * string that will be used, '$' and '\' have special meaning. - * @param all if true, all matches will be replaced otherwise just the first match. - * @see Matcher#quoteReplacement - */ - public PatternReplaceFilter(TokenStream in, - Pattern p, - String replacement, - boolean all) { - super(in); - this.p = p; - this.replacement = (null == replacement) ? "" : replacement; - this.all = all; - this.m = p.matcher(termAtt); - } - - @Override - public boolean incrementToken() throws IOException { - if (!input.incrementToken()) return false; - - m.reset(); - if (m.find()) { - // replaceAll/replaceFirst will reset() this previous find. - String transformed = all ? m.replaceAll(replacement) : m.replaceFirst(replacement); - termAtt.setEmpty().append(transformed); - } - - return true; - } - -} diff --git a/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java deleted file mode 100644 index 09d32506976..00000000000 --- a/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed to ElasticSearch and Shay Banon under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. ElasticSearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.lucene.analysis.pattern; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; - -import java.io.IOException; -import java.io.Reader; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * This tokenizer uses regex pattern matching to construct distinct tokens - * for the input stream. It takes two arguments: "pattern" and "group". - *

- *

- *

- * group=-1 (the default) is equivalent to "split". In this case, the tokens will - * be equivalent to the output from (without empty tokens): - * {@link String#split(java.lang.String)} - *

- *

- * Using group >= 0 selects the matching group as the token. For example, if you have:
- *

- *  pattern = \'([^\']+)\'
- *  group = 0
- *  input = aaa 'bbb' 'ccc'
- * 
- * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input - * but using group=1, the output would be: bbb and ccc (no ' marks) - *

- *

NOTE: This Tokenizer does not output tokens that are of zero length.

- * - * @see Pattern - */ -public final class PatternTokenizer extends Tokenizer { - - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - - private final StringBuilder str = new StringBuilder(); - private int index; - - private final Pattern pattern; - private final int group; - private final Matcher matcher; - - /** - * creates a new PatternTokenizer returning tokens from group (-1 for split functionality) - */ - public PatternTokenizer(Reader input, Pattern pattern, int group) throws IOException { - super(input); - this.pattern = pattern; - this.group = group; - fillBuffer(str, input); - matcher = pattern.matcher(str); - index = 0; - } - - @Override - public boolean incrementToken() throws IOException { - if (index >= str.length()) return false; - clearAttributes(); - if (group >= 0) { - - // match a specific group - while (matcher.find()) { - index = matcher.start(group); - final int endIndex = matcher.end(group); - if (index == endIndex) continue; - termAtt.setEmpty().append(str, index, endIndex); - offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex)); - return true; - } - - index = Integer.MAX_VALUE; // mark exhausted - return false; - - } else { - - // String.split() functionality - while (matcher.find()) { - if (matcher.start() - index > 0) { - // found a non-zero-length token - termAtt.setEmpty().append(str, index, matcher.start()); - offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start())); - index = matcher.end(); - return true; - } - - index = matcher.end(); - } - - if (str.length() - index == 0) { - index = Integer.MAX_VALUE; // mark exhausted - return false; - } - - termAtt.setEmpty().append(str, index, str.length()); - offsetAtt.setOffset(correctOffset(index), correctOffset(str.length())); - index = Integer.MAX_VALUE; // mark exhausted - return true; - } - } - - @Override - public void end() throws IOException { - final int ofs = correctOffset(str.length()); - offsetAtt.setOffset(ofs, ofs); - } - - @Override - public void reset(Reader input) throws IOException { - super.reset(input); - fillBuffer(str, input); - matcher.reset(str); - index = 0; - } - - // TODO: we should see if we can make this tokenizer work without reading - // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ? - final char[] buffer = new char[8192]; - - private void fillBuffer(StringBuilder sb, Reader input) throws IOException { - int len; - sb.setLength(0); - while ((len = input.read(buffer)) > 0) { - sb.append(buffer, 0, len); - } - } -}