From 6fad75df824a087fb304c766ee8dc61ee178f594 Mon Sep 17 00:00:00 2001
From: Igor Motov
Date: Fri, 26 Oct 2012 00:03:16 -0400
Subject: [PATCH] lucene 4: remove Pattern tokenizer and filter
---
.../pattern/PatternReplaceFilter.java | 85 ----------
.../analysis/pattern/PatternTokenizer.java | 153 ------------------
2 files changed, 238 deletions(-)
delete mode 100644 src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
delete mode 100644 src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
diff --git a/src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java b/src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
deleted file mode 100644
index ef7565215b1..00000000000
--- a/src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to ElasticSearch and Shay Banon under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. ElasticSearch licenses this
- * file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.analysis.pattern;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
-import java.io.IOException;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * A TokenFilter which applies a Pattern to each token in the stream,
- * replacing match occurances with the specified replacement string.
- *
- *
- * Note: Depending on the input and the pattern used and the input
- * TokenStream, this TokenFilter may produce Tokens whose text is the empty
- * string.
- *
- *
- * @see Pattern
- */
-public final class PatternReplaceFilter extends TokenFilter {
- private final Pattern p;
- private final String replacement;
- private final boolean all;
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final Matcher m;
-
- /**
- * Constructs an instance to replace either the first, or all occurances
- *
- * @param in the TokenStream to process
- * @param p the patterm to apply to each Token
- * @param replacement the "replacement string" to substitute, if null a
- * blank string will be used. Note that this is not the literal
- * string that will be used, '$' and '\' have special meaning.
- * @param all if true, all matches will be replaced otherwise just the first match.
- * @see Matcher#quoteReplacement
- */
- public PatternReplaceFilter(TokenStream in,
- Pattern p,
- String replacement,
- boolean all) {
- super(in);
- this.p = p;
- this.replacement = (null == replacement) ? "" : replacement;
- this.all = all;
- this.m = p.matcher(termAtt);
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (!input.incrementToken()) return false;
-
- m.reset();
- if (m.find()) {
- // replaceAll/replaceFirst will reset() this previous find.
- String transformed = all ? m.replaceAll(replacement) : m.replaceFirst(replacement);
- termAtt.setEmpty().append(transformed);
- }
-
- return true;
- }
-
-}
diff --git a/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
deleted file mode 100644
index 09d32506976..00000000000
--- a/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Licensed to ElasticSearch and Shay Banon under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. ElasticSearch licenses this
- * file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.analysis.pattern;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * This tokenizer uses regex pattern matching to construct distinct tokens
- * for the input stream. It takes two arguments: "pattern" and "group".
- *
- *
- * - "pattern" is the regular expression.
- * - "group" says which group to extract into tokens.
- *
- *
- * group=-1 (the default) is equivalent to "split". In this case, the tokens will
- * be equivalent to the output from (without empty tokens):
- * {@link String#split(java.lang.String)}
- *
- *
- * Using group >= 0 selects the matching group as the token. For example, if you have:
- *
- * pattern = \'([^\']+)\'
- * group = 0
- * input = aaa 'bbb' 'ccc'
- *
- * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
- * but using group=1, the output would be: bbb and ccc (no ' marks)
- *
- * NOTE: This Tokenizer does not output tokens that are of zero length.
- *
- * @see Pattern
- */
-public final class PatternTokenizer extends Tokenizer {
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
- private final StringBuilder str = new StringBuilder();
- private int index;
-
- private final Pattern pattern;
- private final int group;
- private final Matcher matcher;
-
- /**
- * creates a new PatternTokenizer returning tokens from group (-1 for split functionality)
- */
- public PatternTokenizer(Reader input, Pattern pattern, int group) throws IOException {
- super(input);
- this.pattern = pattern;
- this.group = group;
- fillBuffer(str, input);
- matcher = pattern.matcher(str);
- index = 0;
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (index >= str.length()) return false;
- clearAttributes();
- if (group >= 0) {
-
- // match a specific group
- while (matcher.find()) {
- index = matcher.start(group);
- final int endIndex = matcher.end(group);
- if (index == endIndex) continue;
- termAtt.setEmpty().append(str, index, endIndex);
- offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex));
- return true;
- }
-
- index = Integer.MAX_VALUE; // mark exhausted
- return false;
-
- } else {
-
- // String.split() functionality
- while (matcher.find()) {
- if (matcher.start() - index > 0) {
- // found a non-zero-length token
- termAtt.setEmpty().append(str, index, matcher.start());
- offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
- index = matcher.end();
- return true;
- }
-
- index = matcher.end();
- }
-
- if (str.length() - index == 0) {
- index = Integer.MAX_VALUE; // mark exhausted
- return false;
- }
-
- termAtt.setEmpty().append(str, index, str.length());
- offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
- index = Integer.MAX_VALUE; // mark exhausted
- return true;
- }
- }
-
- @Override
- public void end() throws IOException {
- final int ofs = correctOffset(str.length());
- offsetAtt.setOffset(ofs, ofs);
- }
-
- @Override
- public void reset(Reader input) throws IOException {
- super.reset(input);
- fillBuffer(str, input);
- matcher.reset(str);
- index = 0;
- }
-
- // TODO: we should see if we can make this tokenizer work without reading
- // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
- final char[] buffer = new char[8192];
-
- private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
- int len;
- sb.setLength(0);
- while ((len = input.read(buffer)) > 0) {
- sb.append(buffer, 0, len);
- }
- }
-}