From 87de0c968849462aed4e62627956e0c091c7b401 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 23 Jun 2009 19:15:31 +0000 Subject: [PATCH] LUCENE-1466: added chainable CharFilter stage before Tokenizer to allow mapping of characters before tokenization git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787795 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 7 +- common-build.xml | 2 +- .../lucene/analysis/cjk/CJKTokenizer.java | 4 +- .../lucene/analysis/cn/ChineseTokenizer.java | 4 +- .../analysis/cn/smart/SentenceTokenizer.java | 3 +- .../analysis/ngram/EdgeNGramTokenizer.java | 4 +- .../lucene/analysis/ngram/NGramTokenizer.java | 2 +- .../analysis/WikipediaTokenizer.java | 17 ++- .../lucene/analysis/BaseCharFilter.java | 90 +++++++++++ .../apache/lucene/analysis/CharFilter.java | 61 ++++++++ .../apache/lucene/analysis/CharReader.java | 53 +++++++ .../apache/lucene/analysis/CharStream.java | 36 +++++ .../apache/lucene/analysis/CharTokenizer.java | 6 +- .../lucene/analysis/KeywordTokenizer.java | 6 +- .../lucene/analysis/MappingCharFilter.java | 140 ++++++++++++++++++ .../lucene/analysis/NormalizeCharMap.java | 55 +++++++ .../org/apache/lucene/analysis/Tokenizer.java | 12 +- .../analysis/standard/StandardTokenizer.java | 27 ++-- 18 files changed, 490 insertions(+), 39 deletions(-) create mode 100644 src/java/org/apache/lucene/analysis/BaseCharFilter.java create mode 100644 src/java/org/apache/lucene/analysis/CharFilter.java create mode 100644 src/java/org/apache/lucene/analysis/CharReader.java create mode 100644 src/java/org/apache/lucene/analysis/CharStream.java create mode 100644 src/java/org/apache/lucene/analysis/MappingCharFilter.java create mode 100644 src/java/org/apache/lucene/analysis/NormalizeCharMap.java diff --git a/CHANGES.txt b/CHANGES.txt index afa610793cc..083548ec326 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -236,7 +236,7 @@ API Changes NumericRangeQuery and its new indexing format for numeric or date values. (Uwe Schindler) -23. LUCENE-1630: Deprecate Weight in favor of QueryWeight, which adds +24. LUCENE-1630: Deprecate Weight in favor of QueryWeight, which adds a scorer(IndexReader, boolean /* scoreDocsInOrder */, boolean /* topScorer */) method instead of scorer(IndexReader) (now deprecated). The new method is used by IndexSearcher to mate @@ -254,6 +254,11 @@ API Changes out of order when used with a Collector that can accept docs out of order. (Shai Erera via Mike McCandless) +25. LUCENE-1466: Changed Tokenizer.input to be a CharStream; added + CharFilter and MappingCharFilter, which allows chaining & mapping + of characters before tokenizers run. (Koji Sekiguchi via Mike + McCandless) + Bug fixes 1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals() diff --git a/common-build.xml b/common-build.xml index df1d631b1e8..f4ca41faba3 100644 --- a/common-build.xml +++ b/common-build.xml @@ -42,7 +42,7 @@ - + diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java index 3b6609b94d0..3f218fd72a8 100644 --- a/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java @@ -96,7 +96,7 @@ public final class CJKTokenizer extends Tokenizer { * @param in I/O reader */ public CJKTokenizer(Reader in) { - input = in; + super(in); } //~ Methods ---------------------------------------------------------------- @@ -253,7 +253,7 @@ public final class CJKTokenizer extends Tokenizer { if (length > 0) { return reusableToken.reinit - (buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]); + (buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length), TOKEN_TYPE_NAMES[tokenType]); } else if (dataLen == -1) { return null; } diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java index 92048fadd44..f9a1aec8fff 100644 --- a/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java @@ -55,7 +55,7 @@ public final class ChineseTokenizer extends Tokenizer { public ChineseTokenizer(Reader in) { - input = in; + super(in); } private int offset = 0, bufferIndex=0, dataLen=0; @@ -81,7 +81,7 @@ public final class ChineseTokenizer extends Tokenizer { if (length>0) { //System.out.println(new String(buffer, 0, //length)); - return token.reinit(buffer, 0, length, start, start+length); + return token.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length)); } else return null; diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java index 4d92aef85f9..e9a380b6ce9 100644 --- a/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java @@ -45,6 +45,7 @@ public class SentenceTokenizer extends Tokenizer { private Token t = new Token(); public SentenceTokenizer(Reader reader) { + super(reader); bufferInput = new BufferedReader(reader, 2048); } @@ -91,7 +92,7 @@ public class SentenceTokenizer extends Tokenizer { return null; else { t.clear(); - t.reinit(buffer.toString(), tokenStart, tokenEnd, "sentence"); + t.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence"); return t; } } diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java index 6d1990d90e2..e6fe22b02a8 100644 --- a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java @@ -140,8 +140,8 @@ public class EdgeNGramTokenizer extends Tokenizer { int start = side == Side.FRONT ? 0 : inLen - gramSize; int end = start + gramSize; reusableToken.setTermBuffer(inStr, start, gramSize); - reusableToken.setStartOffset(start); - reusableToken.setEndOffset(end); + reusableToken.setStartOffset(input.correctOffset(start)); + reusableToken.setEndOffset(input.correctOffset(end)); gramSize++; return reusableToken; } diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java index 403eb7f6d59..9bfb4d309e7 100644 --- a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java @@ -85,6 +85,6 @@ public class NGramTokenizer extends Tokenizer { int oldPos = pos; pos++; - return reusableToken.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize); + return reusableToken.reinit(inStr, oldPos, gramSize, input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize)); } } diff --git a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java index 7eff5aab0ac..2e1984a1543 100644 --- a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java +++ b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java @@ -17,6 +17,7 @@ package org.apache.lucene.wikipedia.analysis; +import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; @@ -107,7 +108,7 @@ public class WikipediaTokenizer extends Tokenizer { private Iterator tokens = null; void setInput(Reader reader) { - this.input = reader; + this.input = CharReader.get(reader); } /** @@ -190,8 +191,8 @@ public class WikipediaTokenizer extends Tokenizer { //trim the buffer String s = buffer.toString().trim(); reusableToken.setTermBuffer(s.toCharArray(), 0, s.length()); - reusableToken.setStartOffset(theStart); - reusableToken.setEndOffset(theStart + s.length()); + reusableToken.setStartOffset(input.correctOffset(theStart)); + reusableToken.setEndOffset(input.correctOffset(theStart + s.length())); reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG); //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos if (tmpTokType != WikipediaTokenizerImpl.YYEOF){ @@ -229,8 +230,8 @@ public class WikipediaTokenizer extends Tokenizer { //trim the buffer String s = buffer.toString().trim(); reusableToken.setTermBuffer(s.toCharArray(), 0, s.length()); - reusableToken.setStartOffset(theStart); - reusableToken.setEndOffset(theStart + s.length()); + reusableToken.setStartOffset(input.correctOffset(theStart)); + reusableToken.setEndOffset(input.correctOffset(theStart + s.length())); reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG); //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos if (tmpTokType != WikipediaTokenizerImpl.YYEOF){ @@ -243,8 +244,8 @@ public class WikipediaTokenizer extends Tokenizer { private void setupToken(final Token reusableToken) { scanner.getText(reusableToken); final int start = scanner.yychar(); - reusableToken.setStartOffset(start); - reusableToken.setEndOffset(start + reusableToken.termLength()); + reusableToken.setStartOffset(input.correctOffset(start)); + reusableToken.setEndOffset(input.correctOffset(start + reusableToken.termLength())); } /* @@ -258,7 +259,7 @@ public class WikipediaTokenizer extends Tokenizer { } public void reset(Reader reader) throws IOException { - input = reader; + setInput(reader); reset(); } diff --git a/src/java/org/apache/lucene/analysis/BaseCharFilter.java b/src/java/org/apache/lucene/analysis/BaseCharFilter.java new file mode 100644 index 00000000000..124a324c2bb --- /dev/null +++ b/src/java/org/apache/lucene/analysis/BaseCharFilter.java @@ -0,0 +1,90 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.util.ArrayList; +import java.util.List; + +/** + * Base utility class for implementing a {@link + * CharFilter}. You record mappings by calling {@link + * #addOffCorrectMap}, and then invoke the correct method. + * This class is not particularly efficient, eg a new class + * instance is created for every call to {@link + * #addOffCorrectMap}, which is appended to a private list. + * When retrieving a mapping, that list is linearly + * checked. + * @version $Id$ + */ +public abstract class BaseCharFilter extends CharFilter { + + //private List pcmList; + private List pcmList; + + public BaseCharFilter(CharStream in) { + super(in); + } + + /** Retrieve the corrected offset. Note that this method + * is slow if you correct positions far before the most + * recently added position. */ + protected int correct(int currentOff) { + if (pcmList == null || pcmList.isEmpty()) { + return currentOff; + } + for (int i = pcmList.size() - 1; i >= 0; i--) { + if (currentOff >= ((OffCorrectMap) pcmList.get(i)).off) { + return currentOff + ((OffCorrectMap) pcmList.get(i)).cumulativeDiff; + } + } + return currentOff; + } + + protected int getLastCumulativeDiff() { + return pcmList == null || pcmList.isEmpty() ? + 0 : ((OffCorrectMap)pcmList.get(pcmList.size() - 1)).cumulativeDiff; + } + + protected void addOffCorrectMap(int off, int cumulativeDiff) { + if (pcmList == null) { + pcmList = new ArrayList(); + } + pcmList.add(new OffCorrectMap(off, cumulativeDiff)); + } + + static class OffCorrectMap { + + int off; + int cumulativeDiff; + + OffCorrectMap(int off, int cumulativeDiff) { + this.off = off; + this.cumulativeDiff = cumulativeDiff; + } + + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('('); + sb.append(off); + sb.append(','); + sb.append(cumulativeDiff); + sb.append(')'); + return sb.toString(); + } + } +} diff --git a/src/java/org/apache/lucene/analysis/CharFilter.java b/src/java/org/apache/lucene/analysis/CharFilter.java new file mode 100644 index 00000000000..20cf1d29f6a --- /dev/null +++ b/src/java/org/apache/lucene/analysis/CharFilter.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.io.IOException; + +/** + * Subclasses of CharFilter can be chained to filter CharStream. + * + * @version $Id$ + * + */ +public abstract class CharFilter extends CharStream { + + protected CharStream input; + + protected CharFilter(CharStream in) { + input = in; + } + + /** + * Subclass may want to override to correct the current offset. + * + * @param currentOff current offset + * @return corrected offset + */ + protected int correct(int currentOff) { + return currentOff; + } + + /** + * Chains the corrected offset through the input + * CharFilter. + */ + public final int correctOffset(int currentOff) { + return input.correctOffset(correct(currentOff)); + } + + public void close() throws IOException { + input.close(); + } + + public int read(char[] cbuf, int off, int len) throws IOException { + return input.read(cbuf, off, len); + } +} diff --git a/src/java/org/apache/lucene/analysis/CharReader.java b/src/java/org/apache/lucene/analysis/CharReader.java new file mode 100644 index 00000000000..30c38612913 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/CharReader.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.io.IOException; +import java.io.Reader; + +/** + * CharReader is a Reader wrapper. It reads chars from Reader and outputs CharStream. + * + * @version $Id$ + * + */ +public final class CharReader extends CharStream { + + protected Reader input; + + public static CharStream get(Reader input) { + return input instanceof CharStream ? + (CharStream)input : new CharReader(input); + } + + private CharReader(Reader in) { + input = in; + } + + public int correctOffset(int currentOff) { + return currentOff; + } + + public void close() throws IOException { + input.close(); + } + + public int read(char[] cbuf, int off, int len) throws IOException { + return input.read(cbuf, off, len); + } +} diff --git a/src/java/org/apache/lucene/analysis/CharStream.java b/src/java/org/apache/lucene/analysis/CharStream.java new file mode 100644 index 00000000000..2d10a3d8251 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/CharStream.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.io.Reader; + +/** + * CharStream adds correctOffset functionality over Reader. + * + * @version $Id$ + */ +public abstract class CharStream extends Reader { + + /** + * Called by CharFilter(s) and Tokenizer to correct token offset. + * + * @param currentOff current offset + * @return corrected token offset + */ + public abstract int correctOffset(int currentOff); +} diff --git a/src/java/org/apache/lucene/analysis/CharTokenizer.java b/src/java/org/apache/lucene/analysis/CharTokenizer.java index dc80aac30f8..fc934a0ee24 100644 --- a/src/java/org/apache/lucene/analysis/CharTokenizer.java +++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java @@ -90,7 +90,7 @@ public abstract class CharTokenizer extends Tokenizer { } termAtt.setTermLength(length); - offsetAtt.setOffset(start, start+length); + offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length)); return true; } @@ -134,8 +134,8 @@ public abstract class CharTokenizer extends Tokenizer { } reusableToken.setTermLength(length); - reusableToken.setStartOffset(start); - reusableToken.setEndOffset(start+length); + reusableToken.setStartOffset(input.correctOffset(start)); + reusableToken.setEndOffset(input.correctOffset(start+length)); return reusableToken; } diff --git a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java index b367a304045..5f1cac85207 100644 --- a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java +++ b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java @@ -59,7 +59,7 @@ public class KeywordTokenizer extends Tokenizer { buffer = termAtt.resizeTermBuffer(1+buffer.length); } termAtt.setTermLength(upto); - offsetAtt.setOffset(0, upto); + offsetAtt.setOffset(input.correctOffset(0), input.correctOffset(upto)); return true; } return false; @@ -81,8 +81,8 @@ public class KeywordTokenizer extends Tokenizer { buffer = reusableToken.resizeTermBuffer(1+buffer.length); } reusableToken.setTermLength(upto); - reusableToken.setStartOffset(0); - reusableToken.setEndOffset(upto); + reusableToken.setStartOffset(input.correctOffset(0)); + reusableToken.setEndOffset(input.correctOffset(upto)); return reusableToken; } diff --git a/src/java/org/apache/lucene/analysis/MappingCharFilter.java b/src/java/org/apache/lucene/analysis/MappingCharFilter.java new file mode 100644 index 00000000000..0e89e571d4c --- /dev/null +++ b/src/java/org/apache/lucene/analysis/MappingCharFilter.java @@ -0,0 +1,140 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.io.IOException; +import java.util.LinkedList; + +/** + * {@link CharFilter} that applies the mappings contained in + * a {@link NormalizeCharMap} to the character stream. + * + * @version $Id$ + */ +public class MappingCharFilter extends BaseCharFilter { + + private final NormalizeCharMap normMap; + //private LinkedList buffer; + private LinkedList buffer; + private String replacement; + private int charPointer; + private int nextCharCounter; + + public MappingCharFilter(NormalizeCharMap normMap, CharStream in) { + super(in); + this.normMap = normMap; + } + + public int read() throws IOException { + while(true) { + if (replacement != null && charPointer < replacement.length()) { + return replacement.charAt(charPointer++); + } + + int firstChar = nextChar(); + if (firstChar == -1) return -1; + NormalizeCharMap nm = normMap.submap != null ? + (NormalizeCharMap)normMap.submap.get(Character.valueOf((char) firstChar)) : null; + if (nm == null) return firstChar; + NormalizeCharMap result = match(nm); + if (result == null) return firstChar; + replacement = result.normStr; + charPointer = 0; + if (result.diff != 0) { + int prevCumulativeDiff = getLastCumulativeDiff(); + if (result.diff < 0) { + for(int i = 0; i < -result.diff ; i++) + addOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i); + } else { + addOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff); + } + } + } + } + + private int nextChar() throws IOException { + nextCharCounter++; + if (buffer != null && !buffer.isEmpty()) { + return ((Character)buffer.removeFirst()).charValue(); + } + return input.read(); + } + + private void pushChar(int c) { + nextCharCounter--; + if(buffer == null) + buffer = new LinkedList(); + buffer.addFirst(new Character((char) c)); + } + + private void pushLastChar(int c) { + if (buffer == null) { + buffer = new LinkedList(); + } + buffer.addLast(new Character((char) c)); + } + + private NormalizeCharMap match(NormalizeCharMap map) throws IOException { + NormalizeCharMap result = null; + if (map.submap != null) { + int chr = nextChar(); + if (chr != -1) { + NormalizeCharMap subMap = (NormalizeCharMap) map.submap.get(Character.valueOf((char) chr)); + if (subMap != null) { + result = match(subMap); + } + if (result == null) { + pushChar(chr); + } + } + } + if (result == null && map.normStr != null) { + result = map; + } + return result; + } + + public int read(char[] cbuf, int off, int len) throws IOException { + char[] tmp = new char[len]; + int l = input.read(tmp, 0, len); + if (l != -1) { + for(int i = 0; i < l; i++) + pushLastChar(tmp[i]); + } + l = 0; + for(int i = off; i < off + len; i++) { + int c = read(); + if (c == -1) break; + cbuf[i] = (char) c; + l++; + } + return l == 0 ? -1 : l; + } + + public boolean markSupported() { + return false; + } + + public void mark(int readAheadLimit) throws IOException { + throw new IOException("mark/reset not supported"); + } + + public void reset() throws IOException { + throw new IOException("mark/reset not supported"); + } +} diff --git a/src/java/org/apache/lucene/analysis/NormalizeCharMap.java b/src/java/org/apache/lucene/analysis/NormalizeCharMap.java new file mode 100644 index 00000000000..c60449c39a9 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/NormalizeCharMap.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.util.HashMap; +import java.util.Map; + +/** + * Holds a map of String input to String output, to be used + * with {@link MappingCharFilter}. + * @version $Id$ + */ +public class NormalizeCharMap { + + //Map submap; + Map submap; + String normStr; + int diff; + + public void add(String singleMatch, String replacement) { + NormalizeCharMap currMap = this; + for(int i = 0; i < singleMatch.length(); i++) { + char c = singleMatch.charAt(i); + if (currMap.submap == null) { + currMap.submap = new HashMap(1); + } + NormalizeCharMap map = (NormalizeCharMap) currMap.submap.get(Character.valueOf(c)); + if (map == null) { + map = new NormalizeCharMap(); + currMap.submap.put(new Character(c), map); + } + currMap = map; + } + if (currMap.normStr != null) { + throw new RuntimeException("MappingCharFilter: there is already a mapping for " + singleMatch); + } + currMap.normStr = replacement; + currMap.diff = singleMatch.length() - replacement.length(); + } +} diff --git a/src/java/org/apache/lucene/analysis/Tokenizer.java b/src/java/org/apache/lucene/analysis/Tokenizer.java index 1222e73761d..e2525cf75ec 100644 --- a/src/java/org/apache/lucene/analysis/Tokenizer.java +++ b/src/java/org/apache/lucene/analysis/Tokenizer.java @@ -45,16 +45,20 @@ import java.io.IOException; public abstract class Tokenizer extends TokenStream { /** The text source for this Tokenizer. */ - protected Reader input; + protected CharStream input; /** Construct a tokenizer with null input. */ protected Tokenizer() {} /** Construct a token stream processing the given input. */ protected Tokenizer(Reader input) { - this.input = input; + this.input = CharReader.get(input); } + protected Tokenizer(CharStream input) { + this.input = input; + } + /** By default, closes the input Reader. */ public void close() throws IOException { input.close(); @@ -64,6 +68,10 @@ public abstract class Tokenizer extends TokenStream { * analyzer (in its reusableTokenStream method) will use * this to re-use a previously created tokenizer. */ public void reset(Reader input) throws IOException { + this.input = CharReader.get(input); + } + + public void reset(CharStream input) throws IOException { this.input = input; } } diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java index e1885abc473..61c3fc80afe 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java +++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java @@ -20,6 +20,7 @@ package org.apache.lucene.analysis.standard; import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -91,7 +92,7 @@ public class StandardTokenizer extends Tokenizer { private boolean replaceInvalidAcronym; void setInput(Reader reader) { - this.input = reader; + input = CharReader.get(reader); } private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; @@ -126,7 +127,7 @@ public class StandardTokenizer extends Tokenizer { */ public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) { this.replaceInvalidAcronym = replaceInvalidAcronym; - this.input = input; + setInput(input); this.scanner = new StandardTokenizerImpl(input); termAtt = (TermAttribute) addAttribute(TermAttribute.class); offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); @@ -161,7 +162,7 @@ public class StandardTokenizer extends Tokenizer { posIncrAtt.setPositionIncrement(posIncr); scanner.getText(termAtt); final int start = scanner.yychar(); - offsetAtt.setOffset(start, start+termAtt.termLength()); + offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength())); // This 'if' should be removed in the next release. For now, it converts // invalid acronyms to HOST. When removed, only the 'else' part should // remain. @@ -194,19 +195,19 @@ public class StandardTokenizer extends Tokenizer { int posIncr = 1; while(true) { - int tokenType = scanner.getNextToken(); + int tokenType = scanner.getNextToken(); - if (tokenType == StandardTokenizerImpl.YYEOF) { - return null; - } + if (tokenType == StandardTokenizerImpl.YYEOF) { + return null; + } if (scanner.yylength() <= maxTokenLength) { reusableToken.clear(); reusableToken.setPositionIncrement(posIncr); scanner.getText(reusableToken); final int start = scanner.yychar(); - reusableToken.setStartOffset(start); - reusableToken.setEndOffset(start+reusableToken.termLength()); + reusableToken.setStartOffset(input.correctOffset(start)); + reusableToken.setEndOffset(input.correctOffset(start+reusableToken.termLength())); // This 'if' should be removed in the next release. For now, it converts // invalid acronyms to HOST. When removed, only the 'else' part should // remain. @@ -234,13 +235,13 @@ public class StandardTokenizer extends Tokenizer { * @see org.apache.lucene.analysis.TokenStream#reset() */ public void reset() throws IOException { - super.reset(); - scanner.yyreset(input); + super.reset(); + scanner.yyreset(input); } public void reset(Reader reader) throws IOException { - input = reader; - reset(); + setInput(reader); + reset(); } /**