diff --git a/CHANGES.txt b/CHANGES.txt
index afa610793cc..083548ec326 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -236,7 +236,7 @@ API Changes
NumericRangeQuery and its new indexing format for numeric or
date values. (Uwe Schindler)
-23. LUCENE-1630: Deprecate Weight in favor of QueryWeight, which adds
+24. LUCENE-1630: Deprecate Weight in favor of QueryWeight, which adds
a scorer(IndexReader, boolean /* scoreDocsInOrder */, boolean /*
topScorer */) method instead of scorer(IndexReader) (now
deprecated). The new method is used by IndexSearcher to mate
@@ -254,6 +254,11 @@ API Changes
out of order when used with a Collector that can accept docs out
of order. (Shai Erera via Mike McCandless)
+25. LUCENE-1466: Changed Tokenizer.input to be a CharStream; added
+ CharFilter and MappingCharFilter, which allow chaining & mapping
+ of characters before tokenizers run. (Koji Sekiguchi via Mike
+ McCandless)
+
Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
diff --git a/common-build.xml b/common-build.xml
index df1d631b1e8..f4ca41faba3 100644
--- a/common-build.xml
+++ b/common-build.xml
@@ -42,7 +42,7 @@
-
+
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
index 3b6609b94d0..3f218fd72a8 100644
--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
@@ -96,7 +96,7 @@ public final class CJKTokenizer extends Tokenizer {
* @param in I/O reader
*/
public CJKTokenizer(Reader in) {
- input = in;
+ super(in);
}
//~ Methods ----------------------------------------------------------------
@@ -253,7 +253,7 @@ public final class CJKTokenizer extends Tokenizer {
if (length > 0) {
return reusableToken.reinit
- (buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]);
+ (buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length), TOKEN_TYPE_NAMES[tokenType]);
} else if (dataLen == -1) {
return null;
}
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
index 92048fadd44..f9a1aec8fff 100644
--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
@@ -55,7 +55,7 @@ public final class ChineseTokenizer extends Tokenizer {
public ChineseTokenizer(Reader in) {
- input = in;
+ super(in);
}
private int offset = 0, bufferIndex=0, dataLen=0;
@@ -81,7 +81,7 @@ public final class ChineseTokenizer extends Tokenizer {
if (length>0) {
//System.out.println(new String(buffer, 0,
//length));
- return token.reinit(buffer, 0, length, start, start+length);
+ return token.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length));
}
else
return null;
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
index 4d92aef85f9..e9a380b6ce9 100644
--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
@@ -45,6 +45,7 @@ public class SentenceTokenizer extends Tokenizer {
private Token t = new Token();
public SentenceTokenizer(Reader reader) {
+ super(reader);
bufferInput = new BufferedReader(reader, 2048);
}
@@ -91,7 +92,7 @@ public class SentenceTokenizer extends Tokenizer {
return null;
else {
t.clear();
- t.reinit(buffer.toString(), tokenStart, tokenEnd, "sentence");
+ t.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence");
return t;
}
}
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
index 6d1990d90e2..e6fe22b02a8 100644
--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
@@ -140,8 +140,8 @@ public class EdgeNGramTokenizer extends Tokenizer {
int start = side == Side.FRONT ? 0 : inLen - gramSize;
int end = start + gramSize;
reusableToken.setTermBuffer(inStr, start, gramSize);
- reusableToken.setStartOffset(start);
- reusableToken.setEndOffset(end);
+ reusableToken.setStartOffset(input.correctOffset(start));
+ reusableToken.setEndOffset(input.correctOffset(end));
gramSize++;
return reusableToken;
}
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
index 403eb7f6d59..9bfb4d309e7 100644
--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@@ -85,6 +85,6 @@ public class NGramTokenizer extends Tokenizer {
int oldPos = pos;
pos++;
- return reusableToken.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize);
+ return reusableToken.reinit(inStr, oldPos, gramSize, input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
}
}
diff --git a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
index 7eff5aab0ac..2e1984a1543 100644
--- a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
+++ b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
@@ -17,6 +17,7 @@
package org.apache.lucene.wikipedia.analysis;
+import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
@@ -107,7 +108,7 @@ public class WikipediaTokenizer extends Tokenizer {
private Iterator tokens = null;
void setInput(Reader reader) {
- this.input = reader;
+ this.input = CharReader.get(reader);
}
/**
@@ -190,8 +191,8 @@ public class WikipediaTokenizer extends Tokenizer {
//trim the buffer
String s = buffer.toString().trim();
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
- reusableToken.setStartOffset(theStart);
- reusableToken.setEndOffset(theStart + s.length());
+ reusableToken.setStartOffset(input.correctOffset(theStart));
+ reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -229,8 +230,8 @@ public class WikipediaTokenizer extends Tokenizer {
//trim the buffer
String s = buffer.toString().trim();
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
- reusableToken.setStartOffset(theStart);
- reusableToken.setEndOffset(theStart + s.length());
+ reusableToken.setStartOffset(input.correctOffset(theStart));
+ reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -243,8 +244,8 @@ public class WikipediaTokenizer extends Tokenizer {
private void setupToken(final Token reusableToken) {
scanner.getText(reusableToken);
final int start = scanner.yychar();
- reusableToken.setStartOffset(start);
- reusableToken.setEndOffset(start + reusableToken.termLength());
+ reusableToken.setStartOffset(input.correctOffset(start));
+ reusableToken.setEndOffset(input.correctOffset(start + reusableToken.termLength()));
}
/*
@@ -258,7 +259,7 @@ public class WikipediaTokenizer extends Tokenizer {
}
public void reset(Reader reader) throws IOException {
- input = reader;
+ setInput(reader);
reset();
}
diff --git a/src/java/org/apache/lucene/analysis/BaseCharFilter.java b/src/java/org/apache/lucene/analysis/BaseCharFilter.java
new file mode 100644
index 00000000000..124a324c2bb
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/BaseCharFilter.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Base utility class for implementing a {@link
+ * CharFilter}. You record mappings by calling {@link
+ * #addOffCorrectMap}, and then invoke the {@link #correct} method.
+ * This class is not particularly efficient, e.g. a new
+ * instance is created for every call to {@link
+ * #addOffCorrectMap}, which is appended to a private list.
+ * When retrieving a mapping, that list is linearly
+ * checked.
+ * @version $Id$
+ */
+public abstract class BaseCharFilter extends CharFilter {
+
+ //private List<OffCorrectMap> pcmList;
+ private List pcmList;
+
+ public BaseCharFilter(CharStream in) {
+ super(in);
+ }
+
+ /** Retrieve the corrected offset. Note that this method
+ * is slow if you correct positions far before the most
+ * recently added position. */
+ protected int correct(int currentOff) {
+ if (pcmList == null || pcmList.isEmpty()) {
+ return currentOff;
+ }
+ for (int i = pcmList.size() - 1; i >= 0; i--) {
+ if (currentOff >= ((OffCorrectMap) pcmList.get(i)).off) {
+ return currentOff + ((OffCorrectMap) pcmList.get(i)).cumulativeDiff;
+ }
+ }
+ return currentOff;
+ }
+
+ protected int getLastCumulativeDiff() {
+ return pcmList == null || pcmList.isEmpty() ?
+ 0 : ((OffCorrectMap)pcmList.get(pcmList.size() - 1)).cumulativeDiff;
+ }
+
+ protected void addOffCorrectMap(int off, int cumulativeDiff) {
+ if (pcmList == null) {
+ pcmList = new ArrayList();
+ }
+ pcmList.add(new OffCorrectMap(off, cumulativeDiff));
+ }
+
+ static class OffCorrectMap {
+
+ int off;
+ int cumulativeDiff;
+
+ OffCorrectMap(int off, int cumulativeDiff) {
+ this.off = off;
+ this.cumulativeDiff = cumulativeDiff;
+ }
+
+ public String toString() {
+ StringBuffer sb = new StringBuffer();
+ sb.append('(');
+ sb.append(off);
+ sb.append(',');
+ sb.append(cumulativeDiff);
+ sb.append(')');
+ return sb.toString();
+ }
+ }
+}
diff --git a/src/java/org/apache/lucene/analysis/CharFilter.java b/src/java/org/apache/lucene/analysis/CharFilter.java
new file mode 100644
index 00000000000..20cf1d29f6a
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/CharFilter.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+
+/**
+ * Subclasses of CharFilter can be chained to filter a CharStream.
+ *
+ * @version $Id$
+ *
+ */
+public abstract class CharFilter extends CharStream {
+
+ protected CharStream input;
+
+ protected CharFilter(CharStream in) {
+ input = in;
+ }
+
+ /**
+ * Subclass may want to override to correct the current offset.
+ *
+ * @param currentOff current offset
+ * @return corrected offset
+ */
+ protected int correct(int currentOff) {
+ return currentOff;
+ }
+
+ /**
+ * Chains the corrected offset through the input
+ * CharFilter.
+ */
+ public final int correctOffset(int currentOff) {
+ return input.correctOffset(correct(currentOff));
+ }
+
+ public void close() throws IOException {
+ input.close();
+ }
+
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ return input.read(cbuf, off, len);
+ }
+}
diff --git a/src/java/org/apache/lucene/analysis/CharReader.java b/src/java/org/apache/lucene/analysis/CharReader.java
new file mode 100644
index 00000000000..30c38612913
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/CharReader.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * CharReader is a Reader wrapper. It reads chars from the given Reader and outputs a CharStream.
+ *
+ * @version $Id$
+ *
+ */
+public final class CharReader extends CharStream {
+
+ protected Reader input;
+
+ public static CharStream get(Reader input) {
+ return input instanceof CharStream ?
+ (CharStream)input : new CharReader(input);
+ }
+
+ private CharReader(Reader in) {
+ input = in;
+ }
+
+ public int correctOffset(int currentOff) {
+ return currentOff;
+ }
+
+ public void close() throws IOException {
+ input.close();
+ }
+
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ return input.read(cbuf, off, len);
+ }
+}
diff --git a/src/java/org/apache/lucene/analysis/CharStream.java b/src/java/org/apache/lucene/analysis/CharStream.java
new file mode 100644
index 00000000000..2d10a3d8251
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/CharStream.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.Reader;
+
+/**
+ * CharStream adds {@link #correctOffset} functionality over {@link Reader}.
+ *
+ * @version $Id$
+ */
+public abstract class CharStream extends Reader {
+
+ /**
+ * Called by CharFilter(s) and Tokenizer to correct a token offset.
+ *
+ * @param currentOff current offset
+ * @return corrected token offset
+ */
+ public abstract int correctOffset(int currentOff);
+}
diff --git a/src/java/org/apache/lucene/analysis/CharTokenizer.java b/src/java/org/apache/lucene/analysis/CharTokenizer.java
index dc80aac30f8..fc934a0ee24 100644
--- a/src/java/org/apache/lucene/analysis/CharTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java
@@ -90,7 +90,7 @@ public abstract class CharTokenizer extends Tokenizer {
}
termAtt.setTermLength(length);
- offsetAtt.setOffset(start, start+length);
+ offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
return true;
}
@@ -134,8 +134,8 @@ public abstract class CharTokenizer extends Tokenizer {
}
reusableToken.setTermLength(length);
- reusableToken.setStartOffset(start);
- reusableToken.setEndOffset(start+length);
+ reusableToken.setStartOffset(input.correctOffset(start));
+ reusableToken.setEndOffset(input.correctOffset(start+length));
return reusableToken;
}
diff --git a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
index b367a304045..5f1cac85207 100644
--- a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
@@ -59,7 +59,7 @@ public class KeywordTokenizer extends Tokenizer {
buffer = termAtt.resizeTermBuffer(1+buffer.length);
}
termAtt.setTermLength(upto);
- offsetAtt.setOffset(0, upto);
+ offsetAtt.setOffset(input.correctOffset(0), input.correctOffset(upto));
return true;
}
return false;
@@ -81,8 +81,8 @@ public class KeywordTokenizer extends Tokenizer {
buffer = reusableToken.resizeTermBuffer(1+buffer.length);
}
reusableToken.setTermLength(upto);
- reusableToken.setStartOffset(0);
- reusableToken.setEndOffset(upto);
+ reusableToken.setStartOffset(input.correctOffset(0));
+ reusableToken.setEndOffset(input.correctOffset(upto));
return reusableToken;
}
diff --git a/src/java/org/apache/lucene/analysis/MappingCharFilter.java b/src/java/org/apache/lucene/analysis/MappingCharFilter.java
new file mode 100644
index 00000000000..0e89e571d4c
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/MappingCharFilter.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+/**
+ * {@link CharFilter} that applies the mappings contained in
+ * a {@link NormalizeCharMap} to the character stream.
+ *
+ * @version $Id$
+ */
+public class MappingCharFilter extends BaseCharFilter {
+
+ private final NormalizeCharMap normMap;
+ //private LinkedList<Character> buffer;
+ private LinkedList buffer;
+ private String replacement;
+ private int charPointer;
+ private int nextCharCounter;
+
+ public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
+ super(in);
+ this.normMap = normMap;
+ }
+
+ public int read() throws IOException {
+ while(true) {
+ if (replacement != null && charPointer < replacement.length()) {
+ return replacement.charAt(charPointer++);
+ }
+
+ int firstChar = nextChar();
+ if (firstChar == -1) return -1;
+ NormalizeCharMap nm = normMap.submap != null ?
+ (NormalizeCharMap)normMap.submap.get(Character.valueOf((char) firstChar)) : null;
+ if (nm == null) return firstChar;
+ NormalizeCharMap result = match(nm);
+ if (result == null) return firstChar;
+ replacement = result.normStr;
+ charPointer = 0;
+ if (result.diff != 0) {
+ int prevCumulativeDiff = getLastCumulativeDiff();
+ if (result.diff < 0) {
+ for(int i = 0; i < -result.diff ; i++)
+ addOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i);
+ } else {
+ addOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff);
+ }
+ }
+ }
+ }
+
+ private int nextChar() throws IOException {
+ nextCharCounter++;
+ if (buffer != null && !buffer.isEmpty()) {
+ return ((Character)buffer.removeFirst()).charValue();
+ }
+ return input.read();
+ }
+
+ private void pushChar(int c) {
+ nextCharCounter--;
+ if(buffer == null)
+ buffer = new LinkedList();
+ buffer.addFirst(new Character((char) c));
+ }
+
+ private void pushLastChar(int c) {
+ if (buffer == null) {
+ buffer = new LinkedList();
+ }
+ buffer.addLast(new Character((char) c));
+ }
+
+ private NormalizeCharMap match(NormalizeCharMap map) throws IOException {
+ NormalizeCharMap result = null;
+ if (map.submap != null) {
+ int chr = nextChar();
+ if (chr != -1) {
+ NormalizeCharMap subMap = (NormalizeCharMap) map.submap.get(Character.valueOf((char) chr));
+ if (subMap != null) {
+ result = match(subMap);
+ }
+ if (result == null) {
+ pushChar(chr);
+ }
+ }
+ }
+ if (result == null && map.normStr != null) {
+ result = map;
+ }
+ return result;
+ }
+
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ char[] tmp = new char[len];
+ int l = input.read(tmp, 0, len);
+ if (l != -1) {
+ for(int i = 0; i < l; i++)
+ pushLastChar(tmp[i]);
+ }
+ l = 0;
+ for(int i = off; i < off + len; i++) {
+ int c = read();
+ if (c == -1) break;
+ cbuf[i] = (char) c;
+ l++;
+ }
+ return l == 0 ? -1 : l;
+ }
+
+ public boolean markSupported() {
+ return false;
+ }
+
+ public void mark(int readAheadLimit) throws IOException {
+ throw new IOException("mark/reset not supported");
+ }
+
+ public void reset() throws IOException {
+ throw new IOException("mark/reset not supported");
+ }
+}
diff --git a/src/java/org/apache/lucene/analysis/NormalizeCharMap.java b/src/java/org/apache/lucene/analysis/NormalizeCharMap.java
new file mode 100644
index 00000000000..c60449c39a9
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/NormalizeCharMap.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Holds a map of String input to String output, to be used
+ * with {@link MappingCharFilter}.
+ * @version $Id$
+ */
+public class NormalizeCharMap {
+
+ //Map<Character, NormalizeCharMap> submap;
+ Map submap;
+ String normStr;
+ int diff;
+
+ public void add(String singleMatch, String replacement) {
+ NormalizeCharMap currMap = this;
+ for(int i = 0; i < singleMatch.length(); i++) {
+ char c = singleMatch.charAt(i);
+ if (currMap.submap == null) {
+ currMap.submap = new HashMap(1);
+ }
+ NormalizeCharMap map = (NormalizeCharMap) currMap.submap.get(Character.valueOf(c));
+ if (map == null) {
+ map = new NormalizeCharMap();
+ currMap.submap.put(new Character(c), map);
+ }
+ currMap = map;
+ }
+ if (currMap.normStr != null) {
+ throw new RuntimeException("MappingCharFilter: there is already a mapping for " + singleMatch);
+ }
+ currMap.normStr = replacement;
+ currMap.diff = singleMatch.length() - replacement.length();
+ }
+}
diff --git a/src/java/org/apache/lucene/analysis/Tokenizer.java b/src/java/org/apache/lucene/analysis/Tokenizer.java
index 1222e73761d..e2525cf75ec 100644
--- a/src/java/org/apache/lucene/analysis/Tokenizer.java
+++ b/src/java/org/apache/lucene/analysis/Tokenizer.java
@@ -45,16 +45,20 @@ import java.io.IOException;
public abstract class Tokenizer extends TokenStream {
/** The text source for this Tokenizer. */
- protected Reader input;
+ protected CharStream input;
/** Construct a tokenizer with null input. */
protected Tokenizer() {}
/** Construct a token stream processing the given input. */
protected Tokenizer(Reader input) {
- this.input = input;
+ this.input = CharReader.get(input);
}
+ protected Tokenizer(CharStream input) {
+ this.input = input;
+ }
+
/** By default, closes the input Reader. */
public void close() throws IOException {
input.close();
@@ -64,6 +68,10 @@ public abstract class Tokenizer extends TokenStream {
* analyzer (in its reusableTokenStream method) will use
* this to re-use a previously created tokenizer. */
public void reset(Reader input) throws IOException {
+ this.input = CharReader.get(input);
+ }
+
+ public void reset(CharStream input) throws IOException {
this.input = input;
}
}
diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index e1885abc473..61c3fc80afe 100644
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -91,7 +92,7 @@ public class StandardTokenizer extends Tokenizer {
private boolean replaceInvalidAcronym;
void setInput(Reader reader) {
- this.input = reader;
+ input = CharReader.get(reader);
}
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
@@ -126,7 +127,7 @@ public class StandardTokenizer extends Tokenizer {
*/
public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym;
- this.input = input;
+ setInput(input);
this.scanner = new StandardTokenizerImpl(input);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
@@ -161,7 +162,7 @@ public class StandardTokenizer extends Tokenizer {
posIncrAtt.setPositionIncrement(posIncr);
scanner.getText(termAtt);
final int start = scanner.yychar();
- offsetAtt.setOffset(start, start+termAtt.termLength());
+ offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength()));
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
@@ -194,19 +195,19 @@ public class StandardTokenizer extends Tokenizer {
int posIncr = 1;
while(true) {
- int tokenType = scanner.getNextToken();
+ int tokenType = scanner.getNextToken();
- if (tokenType == StandardTokenizerImpl.YYEOF) {
- return null;
- }
+ if (tokenType == StandardTokenizerImpl.YYEOF) {
+ return null;
+ }
if (scanner.yylength() <= maxTokenLength) {
reusableToken.clear();
reusableToken.setPositionIncrement(posIncr);
scanner.getText(reusableToken);
final int start = scanner.yychar();
- reusableToken.setStartOffset(start);
- reusableToken.setEndOffset(start+reusableToken.termLength());
+ reusableToken.setStartOffset(input.correctOffset(start));
+ reusableToken.setEndOffset(input.correctOffset(start+reusableToken.termLength()));
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
@@ -234,13 +235,13 @@ public class StandardTokenizer extends Tokenizer {
* @see org.apache.lucene.analysis.TokenStream#reset()
*/
public void reset() throws IOException {
- super.reset();
- scanner.yyreset(input);
+ super.reset();
+ scanner.yyreset(input);
}
public void reset(Reader reader) throws IOException {
- input = reader;
- reset();
+ setInput(reader);
+ reset();
}
/**