mirror of https://github.com/apache/lucene.git
LUCENE-1466: added chainable CharFilter stage before Tokenizer to allow mapping of characters before tokenization
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787795 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
75a11ef754
commit
87de0c9688
|
@ -236,7 +236,7 @@ API Changes
|
||||||
NumericRangeQuery and its new indexing format for numeric or
|
NumericRangeQuery and its new indexing format for numeric or
|
||||||
date values. (Uwe Schindler)
|
date values. (Uwe Schindler)
|
||||||
|
|
||||||
23. LUCENE-1630: Deprecate Weight in favor of QueryWeight, which adds
|
24. LUCENE-1630: Deprecate Weight in favor of QueryWeight, which adds
|
||||||
a scorer(IndexReader, boolean /* scoreDocsInOrder */, boolean /*
|
a scorer(IndexReader, boolean /* scoreDocsInOrder */, boolean /*
|
||||||
topScorer */) method instead of scorer(IndexReader) (now
|
topScorer */) method instead of scorer(IndexReader) (now
|
||||||
deprecated). The new method is used by IndexSearcher to mate
|
deprecated). The new method is used by IndexSearcher to mate
|
||||||
|
@ -254,6 +254,11 @@ API Changes
|
||||||
out of order when used with a Collector that can accept docs out
|
out of order when used with a Collector that can accept docs out
|
||||||
of order. (Shai Erera via Mike McCandless)
|
of order. (Shai Erera via Mike McCandless)
|
||||||
|
|
||||||
|
25. LUCENE-1466: Changed Tokenizer.input to be a CharStream; added
|
||||||
|
CharFilter and MappingCharFilter, which allows chaining & mapping
|
||||||
|
of characters before tokenizers run. (Koji Sekiguchi via Mike
|
||||||
|
McCandless)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
|
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
|
||||||
|
|
|
@ -42,7 +42,7 @@
|
||||||
<property name="Name" value="Lucene"/>
|
<property name="Name" value="Lucene"/>
|
||||||
<property name="dev.version" value="2.9-dev"/>
|
<property name="dev.version" value="2.9-dev"/>
|
||||||
<property name="version" value="${dev.version}"/>
|
<property name="version" value="${dev.version}"/>
|
||||||
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090623"/>
|
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090624"/>
|
||||||
<property name="spec.version" value="${version}"/>
|
<property name="spec.version" value="${version}"/>
|
||||||
<property name="year" value="2000-${current.year}"/>
|
<property name="year" value="2000-${current.year}"/>
|
||||||
<property name="final.name" value="lucene-${name}-${version}"/>
|
<property name="final.name" value="lucene-${name}-${version}"/>
|
||||||
|
|
|
@ -96,7 +96,7 @@ public final class CJKTokenizer extends Tokenizer {
|
||||||
* @param in I/O reader
|
* @param in I/O reader
|
||||||
*/
|
*/
|
||||||
public CJKTokenizer(Reader in) {
|
public CJKTokenizer(Reader in) {
|
||||||
input = in;
|
super(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
//~ Methods ----------------------------------------------------------------
|
//~ Methods ----------------------------------------------------------------
|
||||||
|
@ -253,7 +253,7 @@ public final class CJKTokenizer extends Tokenizer {
|
||||||
|
|
||||||
if (length > 0) {
|
if (length > 0) {
|
||||||
return reusableToken.reinit
|
return reusableToken.reinit
|
||||||
(buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]);
|
(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length), TOKEN_TYPE_NAMES[tokenType]);
|
||||||
} else if (dataLen == -1) {
|
} else if (dataLen == -1) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,7 +55,7 @@ public final class ChineseTokenizer extends Tokenizer {
|
||||||
|
|
||||||
|
|
||||||
public ChineseTokenizer(Reader in) {
|
public ChineseTokenizer(Reader in) {
|
||||||
input = in;
|
super(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
private int offset = 0, bufferIndex=0, dataLen=0;
|
private int offset = 0, bufferIndex=0, dataLen=0;
|
||||||
|
@ -81,7 +81,7 @@ public final class ChineseTokenizer extends Tokenizer {
|
||||||
if (length>0) {
|
if (length>0) {
|
||||||
//System.out.println(new String(buffer, 0,
|
//System.out.println(new String(buffer, 0,
|
||||||
//length));
|
//length));
|
||||||
return token.reinit(buffer, 0, length, start, start+length);
|
return token.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -45,6 +45,7 @@ public class SentenceTokenizer extends Tokenizer {
|
||||||
private Token t = new Token();
|
private Token t = new Token();
|
||||||
|
|
||||||
public SentenceTokenizer(Reader reader) {
|
public SentenceTokenizer(Reader reader) {
|
||||||
|
super(reader);
|
||||||
bufferInput = new BufferedReader(reader, 2048);
|
bufferInput = new BufferedReader(reader, 2048);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -91,7 +92,7 @@ public class SentenceTokenizer extends Tokenizer {
|
||||||
return null;
|
return null;
|
||||||
else {
|
else {
|
||||||
t.clear();
|
t.clear();
|
||||||
t.reinit(buffer.toString(), tokenStart, tokenEnd, "sentence");
|
t.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence");
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -140,8 +140,8 @@ public class EdgeNGramTokenizer extends Tokenizer {
|
||||||
int start = side == Side.FRONT ? 0 : inLen - gramSize;
|
int start = side == Side.FRONT ? 0 : inLen - gramSize;
|
||||||
int end = start + gramSize;
|
int end = start + gramSize;
|
||||||
reusableToken.setTermBuffer(inStr, start, gramSize);
|
reusableToken.setTermBuffer(inStr, start, gramSize);
|
||||||
reusableToken.setStartOffset(start);
|
reusableToken.setStartOffset(input.correctOffset(start));
|
||||||
reusableToken.setEndOffset(end);
|
reusableToken.setEndOffset(input.correctOffset(end));
|
||||||
gramSize++;
|
gramSize++;
|
||||||
return reusableToken;
|
return reusableToken;
|
||||||
}
|
}
|
||||||
|
|
|
@ -85,6 +85,6 @@ public class NGramTokenizer extends Tokenizer {
|
||||||
|
|
||||||
int oldPos = pos;
|
int oldPos = pos;
|
||||||
pos++;
|
pos++;
|
||||||
return reusableToken.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize);
|
return reusableToken.reinit(inStr, oldPos, gramSize, input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
|
|
||||||
package org.apache.lucene.wikipedia.analysis;
|
package org.apache.lucene.wikipedia.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharReader;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
@ -107,7 +108,7 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
private Iterator tokens = null;
|
private Iterator tokens = null;
|
||||||
|
|
||||||
void setInput(Reader reader) {
|
void setInput(Reader reader) {
|
||||||
this.input = reader;
|
this.input = CharReader.get(reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -190,8 +191,8 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
//trim the buffer
|
//trim the buffer
|
||||||
String s = buffer.toString().trim();
|
String s = buffer.toString().trim();
|
||||||
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
|
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||||
reusableToken.setStartOffset(theStart);
|
reusableToken.setStartOffset(input.correctOffset(theStart));
|
||||||
reusableToken.setEndOffset(theStart + s.length());
|
reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
|
||||||
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||||
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
||||||
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||||
|
@ -229,8 +230,8 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
//trim the buffer
|
//trim the buffer
|
||||||
String s = buffer.toString().trim();
|
String s = buffer.toString().trim();
|
||||||
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
|
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||||
reusableToken.setStartOffset(theStart);
|
reusableToken.setStartOffset(input.correctOffset(theStart));
|
||||||
reusableToken.setEndOffset(theStart + s.length());
|
reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
|
||||||
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||||
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
||||||
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||||
|
@ -243,8 +244,8 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
private void setupToken(final Token reusableToken) {
|
private void setupToken(final Token reusableToken) {
|
||||||
scanner.getText(reusableToken);
|
scanner.getText(reusableToken);
|
||||||
final int start = scanner.yychar();
|
final int start = scanner.yychar();
|
||||||
reusableToken.setStartOffset(start);
|
reusableToken.setStartOffset(input.correctOffset(start));
|
||||||
reusableToken.setEndOffset(start + reusableToken.termLength());
|
reusableToken.setEndOffset(input.correctOffset(start + reusableToken.termLength()));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -258,7 +259,7 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void reset(Reader reader) throws IOException {
|
public void reset(Reader reader) throws IOException {
|
||||||
input = reader;
|
setInput(reader);
|
||||||
reset();
|
reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,90 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base utility class for implementing a {@link
|
||||||
|
* CharFilter}. You record mappings by calling {@link
|
||||||
|
* #addOffCorrectMap}, and then invoke the correct method.
|
||||||
|
* This class is not particularly efficient, eg a new class
|
||||||
|
* instance is created for every call to {@link
|
||||||
|
* #addOffCorrectMap}, which is appended to a private list.
|
||||||
|
* When retrieving a mapping, that list is linearly
|
||||||
|
* checked.
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public abstract class BaseCharFilter extends CharFilter {
|
||||||
|
|
||||||
|
//private List<OffCorrectMap> pcmList;
|
||||||
|
private List pcmList;
|
||||||
|
|
||||||
|
public BaseCharFilter(CharStream in) {
|
||||||
|
super(in);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Retrieve the corrected offset. Note that this method
|
||||||
|
* is slow if you correct positions far before the most
|
||||||
|
* recently added position. */
|
||||||
|
protected int correct(int currentOff) {
|
||||||
|
if (pcmList == null || pcmList.isEmpty()) {
|
||||||
|
return currentOff;
|
||||||
|
}
|
||||||
|
for (int i = pcmList.size() - 1; i >= 0; i--) {
|
||||||
|
if (currentOff >= ((OffCorrectMap) pcmList.get(i)).off) {
|
||||||
|
return currentOff + ((OffCorrectMap) pcmList.get(i)).cumulativeDiff;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return currentOff;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected int getLastCumulativeDiff() {
|
||||||
|
return pcmList == null || pcmList.isEmpty() ?
|
||||||
|
0 : ((OffCorrectMap)pcmList.get(pcmList.size() - 1)).cumulativeDiff;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void addOffCorrectMap(int off, int cumulativeDiff) {
|
||||||
|
if (pcmList == null) {
|
||||||
|
pcmList = new ArrayList();
|
||||||
|
}
|
||||||
|
pcmList.add(new OffCorrectMap(off, cumulativeDiff));
|
||||||
|
}
|
||||||
|
|
||||||
|
static class OffCorrectMap {
|
||||||
|
|
||||||
|
int off;
|
||||||
|
int cumulativeDiff;
|
||||||
|
|
||||||
|
OffCorrectMap(int off, int cumulativeDiff) {
|
||||||
|
this.off = off;
|
||||||
|
this.cumulativeDiff = cumulativeDiff;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
sb.append('(');
|
||||||
|
sb.append(off);
|
||||||
|
sb.append(',');
|
||||||
|
sb.append(cumulativeDiff);
|
||||||
|
sb.append(')');
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,61 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Subclasses of CharFilter can be chained to filter CharStream.
|
||||||
|
*
|
||||||
|
* @version $Id$
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public abstract class CharFilter extends CharStream {
|
||||||
|
|
||||||
|
protected CharStream input;
|
||||||
|
|
||||||
|
protected CharFilter(CharStream in) {
|
||||||
|
input = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Subclass may want to override to correct the current offset.
|
||||||
|
*
|
||||||
|
* @param currentOff current offset
|
||||||
|
* @return corrected offset
|
||||||
|
*/
|
||||||
|
protected int correct(int currentOff) {
|
||||||
|
return currentOff;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Chains the corrected offset through the input
|
||||||
|
* CharFilter.
|
||||||
|
*/
|
||||||
|
public final int correctOffset(int currentOff) {
|
||||||
|
return input.correctOffset(correct(currentOff));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() throws IOException {
|
||||||
|
input.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
|
return input.read(cbuf, off, len);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,53 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* CharReader is a Reader wrapper. It reads chars from Reader and outputs CharStream.
|
||||||
|
*
|
||||||
|
* @version $Id$
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public final class CharReader extends CharStream {
|
||||||
|
|
||||||
|
protected Reader input;
|
||||||
|
|
||||||
|
public static CharStream get(Reader input) {
|
||||||
|
return input instanceof CharStream ?
|
||||||
|
(CharStream)input : new CharReader(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
private CharReader(Reader in) {
|
||||||
|
input = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int correctOffset(int currentOff) {
|
||||||
|
return currentOff;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() throws IOException {
|
||||||
|
input.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
|
return input.read(cbuf, off, len);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* CharStream adds <a href="#correctOffset(int)">correctOffset</a> functionality over Reader.
|
||||||
|
*
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public abstract class CharStream extends Reader {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Called by CharFilter(s) and Tokenizer to correct token offset.
|
||||||
|
*
|
||||||
|
* @param currentOff current offset
|
||||||
|
* @return corrected token offset
|
||||||
|
*/
|
||||||
|
public abstract int correctOffset(int currentOff);
|
||||||
|
}
|
|
@ -90,7 +90,7 @@ public abstract class CharTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
termAtt.setTermLength(length);
|
termAtt.setTermLength(length);
|
||||||
offsetAtt.setOffset(start, start+length);
|
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -134,8 +134,8 @@ public abstract class CharTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
reusableToken.setTermLength(length);
|
reusableToken.setTermLength(length);
|
||||||
reusableToken.setStartOffset(start);
|
reusableToken.setStartOffset(input.correctOffset(start));
|
||||||
reusableToken.setEndOffset(start+length);
|
reusableToken.setEndOffset(input.correctOffset(start+length));
|
||||||
return reusableToken;
|
return reusableToken;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -59,7 +59,7 @@ public class KeywordTokenizer extends Tokenizer {
|
||||||
buffer = termAtt.resizeTermBuffer(1+buffer.length);
|
buffer = termAtt.resizeTermBuffer(1+buffer.length);
|
||||||
}
|
}
|
||||||
termAtt.setTermLength(upto);
|
termAtt.setTermLength(upto);
|
||||||
offsetAtt.setOffset(0, upto);
|
offsetAtt.setOffset(input.correctOffset(0), input.correctOffset(upto));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
@ -81,8 +81,8 @@ public class KeywordTokenizer extends Tokenizer {
|
||||||
buffer = reusableToken.resizeTermBuffer(1+buffer.length);
|
buffer = reusableToken.resizeTermBuffer(1+buffer.length);
|
||||||
}
|
}
|
||||||
reusableToken.setTermLength(upto);
|
reusableToken.setTermLength(upto);
|
||||||
reusableToken.setStartOffset(0);
|
reusableToken.setStartOffset(input.correctOffset(0));
|
||||||
reusableToken.setEndOffset(upto);
|
reusableToken.setEndOffset(input.correctOffset(upto));
|
||||||
|
|
||||||
return reusableToken;
|
return reusableToken;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,140 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link CharFilter} that applies the mappings contained in
|
||||||
|
* a {@link NormalizeCharMap} to the character stream.
|
||||||
|
*
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class MappingCharFilter extends BaseCharFilter {
|
||||||
|
|
||||||
|
private final NormalizeCharMap normMap;
|
||||||
|
//private LinkedList<Character> buffer;
|
||||||
|
private LinkedList buffer;
|
||||||
|
private String replacement;
|
||||||
|
private int charPointer;
|
||||||
|
private int nextCharCounter;
|
||||||
|
|
||||||
|
public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
|
||||||
|
super(in);
|
||||||
|
this.normMap = normMap;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int read() throws IOException {
|
||||||
|
while(true) {
|
||||||
|
if (replacement != null && charPointer < replacement.length()) {
|
||||||
|
return replacement.charAt(charPointer++);
|
||||||
|
}
|
||||||
|
|
||||||
|
int firstChar = nextChar();
|
||||||
|
if (firstChar == -1) return -1;
|
||||||
|
NormalizeCharMap nm = normMap.submap != null ?
|
||||||
|
(NormalizeCharMap)normMap.submap.get(Character.valueOf((char) firstChar)) : null;
|
||||||
|
if (nm == null) return firstChar;
|
||||||
|
NormalizeCharMap result = match(nm);
|
||||||
|
if (result == null) return firstChar;
|
||||||
|
replacement = result.normStr;
|
||||||
|
charPointer = 0;
|
||||||
|
if (result.diff != 0) {
|
||||||
|
int prevCumulativeDiff = getLastCumulativeDiff();
|
||||||
|
if (result.diff < 0) {
|
||||||
|
for(int i = 0; i < -result.diff ; i++)
|
||||||
|
addOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i);
|
||||||
|
} else {
|
||||||
|
addOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int nextChar() throws IOException {
|
||||||
|
nextCharCounter++;
|
||||||
|
if (buffer != null && !buffer.isEmpty()) {
|
||||||
|
return ((Character)buffer.removeFirst()).charValue();
|
||||||
|
}
|
||||||
|
return input.read();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void pushChar(int c) {
|
||||||
|
nextCharCounter--;
|
||||||
|
if(buffer == null)
|
||||||
|
buffer = new LinkedList();
|
||||||
|
buffer.addFirst(new Character((char) c));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void pushLastChar(int c) {
|
||||||
|
if (buffer == null) {
|
||||||
|
buffer = new LinkedList();
|
||||||
|
}
|
||||||
|
buffer.addLast(new Character((char) c));
|
||||||
|
}
|
||||||
|
|
||||||
|
private NormalizeCharMap match(NormalizeCharMap map) throws IOException {
|
||||||
|
NormalizeCharMap result = null;
|
||||||
|
if (map.submap != null) {
|
||||||
|
int chr = nextChar();
|
||||||
|
if (chr != -1) {
|
||||||
|
NormalizeCharMap subMap = (NormalizeCharMap) map.submap.get(Character.valueOf((char) chr));
|
||||||
|
if (subMap != null) {
|
||||||
|
result = match(subMap);
|
||||||
|
}
|
||||||
|
if (result == null) {
|
||||||
|
pushChar(chr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (result == null && map.normStr != null) {
|
||||||
|
result = map;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
|
char[] tmp = new char[len];
|
||||||
|
int l = input.read(tmp, 0, len);
|
||||||
|
if (l != -1) {
|
||||||
|
for(int i = 0; i < l; i++)
|
||||||
|
pushLastChar(tmp[i]);
|
||||||
|
}
|
||||||
|
l = 0;
|
||||||
|
for(int i = off; i < off + len; i++) {
|
||||||
|
int c = read();
|
||||||
|
if (c == -1) break;
|
||||||
|
cbuf[i] = (char) c;
|
||||||
|
l++;
|
||||||
|
}
|
||||||
|
return l == 0 ? -1 : l;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean markSupported() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void mark(int readAheadLimit) throws IOException {
|
||||||
|
throw new IOException("mark/reset not supported");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset() throws IOException {
|
||||||
|
throw new IOException("mark/reset not supported");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,55 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Holds a map of String input to String output, to be used
|
||||||
|
* with {@link MappingCharFilter}.
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class NormalizeCharMap {
|
||||||
|
|
||||||
|
//Map<Character, NormalizeMap> submap;
|
||||||
|
Map submap;
|
||||||
|
String normStr;
|
||||||
|
int diff;
|
||||||
|
|
||||||
|
public void add(String singleMatch, String replacement) {
|
||||||
|
NormalizeCharMap currMap = this;
|
||||||
|
for(int i = 0; i < singleMatch.length(); i++) {
|
||||||
|
char c = singleMatch.charAt(i);
|
||||||
|
if (currMap.submap == null) {
|
||||||
|
currMap.submap = new HashMap(1);
|
||||||
|
}
|
||||||
|
NormalizeCharMap map = (NormalizeCharMap) currMap.submap.get(Character.valueOf(c));
|
||||||
|
if (map == null) {
|
||||||
|
map = new NormalizeCharMap();
|
||||||
|
currMap.submap.put(new Character(c), map);
|
||||||
|
}
|
||||||
|
currMap = map;
|
||||||
|
}
|
||||||
|
if (currMap.normStr != null) {
|
||||||
|
throw new RuntimeException("MappingCharFilter: there is already a mapping for " + singleMatch);
|
||||||
|
}
|
||||||
|
currMap.normStr = replacement;
|
||||||
|
currMap.diff = singleMatch.length() - replacement.length();
|
||||||
|
}
|
||||||
|
}
|
|
@ -45,16 +45,20 @@ import java.io.IOException;
|
||||||
|
|
||||||
public abstract class Tokenizer extends TokenStream {
|
public abstract class Tokenizer extends TokenStream {
|
||||||
/** The text source for this Tokenizer. */
|
/** The text source for this Tokenizer. */
|
||||||
protected Reader input;
|
protected CharStream input;
|
||||||
|
|
||||||
/** Construct a tokenizer with null input. */
|
/** Construct a tokenizer with null input. */
|
||||||
protected Tokenizer() {}
|
protected Tokenizer() {}
|
||||||
|
|
||||||
/** Construct a token stream processing the given input. */
|
/** Construct a token stream processing the given input. */
|
||||||
protected Tokenizer(Reader input) {
|
protected Tokenizer(Reader input) {
|
||||||
this.input = input;
|
this.input = CharReader.get(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected Tokenizer(CharStream input) {
|
||||||
|
this.input = input;
|
||||||
|
}
|
||||||
|
|
||||||
/** By default, closes the input Reader. */
|
/** By default, closes the input Reader. */
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
input.close();
|
input.close();
|
||||||
|
@ -64,6 +68,10 @@ public abstract class Tokenizer extends TokenStream {
|
||||||
* analyzer (in its reusableTokenStream method) will use
|
* analyzer (in its reusableTokenStream method) will use
|
||||||
* this to re-use a previously created tokenizer. */
|
* this to re-use a previously created tokenizer. */
|
||||||
public void reset(Reader input) throws IOException {
|
public void reset(Reader input) throws IOException {
|
||||||
|
this.input = CharReader.get(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset(CharStream input) throws IOException {
|
||||||
this.input = input;
|
this.input = input;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.standard;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharReader;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
@ -91,7 +92,7 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
private boolean replaceInvalidAcronym;
|
private boolean replaceInvalidAcronym;
|
||||||
|
|
||||||
void setInput(Reader reader) {
|
void setInput(Reader reader) {
|
||||||
this.input = reader;
|
input = CharReader.get(reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
|
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
|
||||||
|
@ -126,7 +127,7 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
*/
|
*/
|
||||||
public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
|
public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
|
||||||
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
this.input = input;
|
setInput(input);
|
||||||
this.scanner = new StandardTokenizerImpl(input);
|
this.scanner = new StandardTokenizerImpl(input);
|
||||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||||
|
@ -161,7 +162,7 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
posIncrAtt.setPositionIncrement(posIncr);
|
posIncrAtt.setPositionIncrement(posIncr);
|
||||||
scanner.getText(termAtt);
|
scanner.getText(termAtt);
|
||||||
final int start = scanner.yychar();
|
final int start = scanner.yychar();
|
||||||
offsetAtt.setOffset(start, start+termAtt.termLength());
|
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength()));
|
||||||
// This 'if' should be removed in the next release. For now, it converts
|
// This 'if' should be removed in the next release. For now, it converts
|
||||||
// invalid acronyms to HOST. When removed, only the 'else' part should
|
// invalid acronyms to HOST. When removed, only the 'else' part should
|
||||||
// remain.
|
// remain.
|
||||||
|
@ -194,19 +195,19 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
int posIncr = 1;
|
int posIncr = 1;
|
||||||
|
|
||||||
while(true) {
|
while(true) {
|
||||||
int tokenType = scanner.getNextToken();
|
int tokenType = scanner.getNextToken();
|
||||||
|
|
||||||
if (tokenType == StandardTokenizerImpl.YYEOF) {
|
if (tokenType == StandardTokenizerImpl.YYEOF) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (scanner.yylength() <= maxTokenLength) {
|
if (scanner.yylength() <= maxTokenLength) {
|
||||||
reusableToken.clear();
|
reusableToken.clear();
|
||||||
reusableToken.setPositionIncrement(posIncr);
|
reusableToken.setPositionIncrement(posIncr);
|
||||||
scanner.getText(reusableToken);
|
scanner.getText(reusableToken);
|
||||||
final int start = scanner.yychar();
|
final int start = scanner.yychar();
|
||||||
reusableToken.setStartOffset(start);
|
reusableToken.setStartOffset(input.correctOffset(start));
|
||||||
reusableToken.setEndOffset(start+reusableToken.termLength());
|
reusableToken.setEndOffset(input.correctOffset(start+reusableToken.termLength()));
|
||||||
// This 'if' should be removed in the next release. For now, it converts
|
// This 'if' should be removed in the next release. For now, it converts
|
||||||
// invalid acronyms to HOST. When removed, only the 'else' part should
|
// invalid acronyms to HOST. When removed, only the 'else' part should
|
||||||
// remain.
|
// remain.
|
||||||
|
@ -234,13 +235,13 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
* @see org.apache.lucene.analysis.TokenStream#reset()
|
* @see org.apache.lucene.analysis.TokenStream#reset()
|
||||||
*/
|
*/
|
||||||
public void reset() throws IOException {
|
public void reset() throws IOException {
|
||||||
super.reset();
|
super.reset();
|
||||||
scanner.yyreset(input);
|
scanner.yyreset(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void reset(Reader reader) throws IOException {
|
public void reset(Reader reader) throws IOException {
|
||||||
input = reader;
|
setInput(reader);
|
||||||
reset();
|
reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in New Issue