lucene 4: remove Pattern tokenizer and filter

2012-10-26 00:03:16 -04:00 · 2012-10-26 00:03:16 -04:00 · 6fad75df82
parent 097cb2dac7
commit 6fad75df82
2 changed files with 0 additions and 238 deletions
--- a/src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
+++ b/src/main/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
@@ -1,85 +0,0 @@
-/*
- * Licensed to ElasticSearch and Shay Banon under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. ElasticSearch licenses this
- * file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.analysis.pattern;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
-import java.io.IOException;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * A TokenFilter which applies a Pattern to each token in the stream,
- * replacing match occurrences with the specified replacement string.
- * <p/>
- * <p>
- * <b>Note:</b> Depending on the pattern used and the input TokenStream,
- * this TokenFilter may produce Tokens whose text is the empty string.
- * </p>
- * <p>
- * A single {@link Matcher} is created over the term attribute in the
- * constructor and reset for every token, so no new Matcher is created
- * per token.
- * </p>
- *
- * @see Pattern
- */
-public final class PatternReplaceFilter extends TokenFilter {
-    private final Pattern p; // pattern applied to every token
-    private final String replacement; // never null; a null constructor argument becomes ""
-    private final boolean all; // true: replace every match, false: only the first
-    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-    private final Matcher m; // bound to termAtt once, reset for each token
-
-    /**
-     * Constructs an instance to replace either the first, or all occurrences
-     * of the pattern in each token.
-     *
-     * @param in          the TokenStream to process
-     * @param p           the pattern to apply to each Token
-     * @param replacement the "replacement string" to substitute, if null a
-     *                    blank string will be used. Note that this is not the literal
-     *                    string that will be used, '$' and '\' have special meaning.
-     * @param all         if true, all matches will be replaced otherwise just the first match.
-     * @see Matcher#quoteReplacement
-     */
-    public PatternReplaceFilter(TokenStream in,
-                                Pattern p,
-                                String replacement,
-                                boolean all) {
-        super(in);
-        this.p = p;
-        this.replacement = (null == replacement) ? "" : replacement;
-        this.all = all;
-        this.m = p.matcher(termAtt); // the matcher reads the term attribute directly as its input
-    }
-
-    @Override
-    public boolean incrementToken() throws IOException { // rewrites the next upstream token's term in place
-        if (!input.incrementToken()) return false; // upstream is exhausted
-
-        m.reset(); // termAtt now holds the new token's text; restart matching from its beginning
-        if (m.find()) {
-            // replaceAll/replaceFirst reset() the matcher themselves, so the find() above
-            // only decides whether the term needs to be rewritten at all.
-            String transformed = all ? m.replaceAll(replacement) : m.replaceFirst(replacement);
-            termAtt.setEmpty().append(transformed);
-        }
-
-        return true; // the token is passed on even when unchanged or when its text became empty
-    }
-
-}
--- a/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
+++ b/src/main/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
@@ -1,153 +0,0 @@
-/*
- * Licensed to ElasticSearch and Shay Banon under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. ElasticSearch licenses this
- * file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.analysis.pattern;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * This tokenizer uses regex pattern matching to construct distinct tokens
- * for the input stream.  It takes two arguments:  "pattern" and "group".
- * <p/>
- * <ul>
- * <li>"pattern" is the regular expression.</li>
- * <li>"group" says which group to extract into tokens.</li>
- * </ul>
- * <p>
- * group=-1 (the default) is equivalent to "split".  In this case, the tokens will
- * be equivalent to the output from (without empty tokens):
- * {@link String#split(java.lang.String)}
- * </p>
- * <p>
- * Using group >= 0 selects the matching group as the token.  For example, if you have:<br/>
- * <pre>
- *  pattern = \'([^\']+)\'
- *  group = 0
- *  input = aaa 'bbb' 'ccc'
- * </pre>
- * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks).  With the same input
- * but using group=1, the output would be: bbb and ccc (no ' marks)
- * </p>
- * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
- *
- * @see Pattern
- */
-public final class PatternTokenizer extends Tokenizer {
-
-    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
-    private final StringBuilder str = new StringBuilder(); // the complete input text, filled by fillBuffer
-    private int index; // scan position in str; Integer.MAX_VALUE once the tokenizer is exhausted
-
-    private final Pattern pattern;
-    private final int group; // group to emit when >= 0; a negative value selects split behavior
-    private final Matcher matcher; // bound to str; re-bound in reset(Reader)
-
-    /**
-     * creates a new PatternTokenizer returning tokens from group (-1 for split functionality)
-     * <p>
-     * The whole content of the reader is read into an internal buffer here,
-     * before any token is produced.
-     * </p>
-     *
-     * @param input   the reader supplying the text to tokenize
-     * @param pattern the regular expression to match against the text
-     * @param group   the group to extract into tokens when >= 0; a negative
-     *                value splits the text around the matches instead
-     * @throws IOException if reading from the input fails
-     */
-    public PatternTokenizer(Reader input, Pattern pattern, int group) throws IOException {
-        super(input);
-        this.pattern = pattern;
-        this.group = group;
-        fillBuffer(str, input);
-        matcher = pattern.matcher(str);
-        index = 0;
-    }
-
-    @Override
-    public boolean incrementToken() throws IOException { // emits the next non-empty token, or returns false when done
-        if (index >= str.length()) return false; // empty input, or already marked exhausted
-        clearAttributes();
-        if (group >= 0) {
-
-            // match a specific group
-            while (matcher.find()) {
-                index = matcher.start(group);
-                final int endIndex = matcher.end(group);
-                if (index == endIndex) continue; // zero-length group (or group did not participate: both are -1), skip it
-                termAtt.setEmpty().append(str, index, endIndex);
-                offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex));
-                return true;
-            }
-
-            index = Integer.MAX_VALUE; // mark exhausted
-            return false;
-
-        } else {
-
-            // String.split() functionality
-            while (matcher.find()) {
-                if (matcher.start() - index > 0) {
-                    // found a non-zero-length token
-                    termAtt.setEmpty().append(str, index, matcher.start());
-                    offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
-                    index = matcher.end(); // the next token starts right after this delimiter
-                    return true;
-                }
-
-                index = matcher.end(); // nothing between the previous delimiter and this one; move past it
-            }
-
-            if (str.length() - index == 0) { // no text left after the last delimiter
-                index = Integer.MAX_VALUE; // mark exhausted
-                return false;
-            }
-
-            termAtt.setEmpty().append(str, index, str.length()); // trailing text after the last delimiter is the final token
-            offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
-            index = Integer.MAX_VALUE; // mark exhausted
-            return true;
-        }
-    }
-
-    @Override
-    public void end() throws IOException { // sets start and end offset to the corrected end of the input
-        final int ofs = correctOffset(str.length());
-        offsetAtt.setOffset(ofs, ofs);
-    }
-
-    @Override
-    public void reset(Reader input) throws IOException { // switches to a new input: re-reads it fully and restarts matching
-        super.reset(input);
-        fillBuffer(str, input);
-        matcher.reset(str);
-        index = 0;
-    }
-
-    // TODO: we should see if we can make this tokenizer work without reading
-    // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
-    final char[] buffer = new char[8192]; // scratch buffer reused by every fillBuffer call
-
-    private void fillBuffer(StringBuilder sb, Reader input) throws IOException { // replaces sb's content with everything read from input
-        int len;
-        sb.setLength(0); // discard text from any previous input
-        while ((len = input.read(buffer)) > 0) { // stops once read() reports no more characters
-            sb.append(buffer, 0, len);
-        }
-    }
-}