SOLR-822: Add CharFilter so that characters can be filtered before Tokenizer/TokenFilters.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@713902 13f79535-47bb-0310-9956-ffa450edef68
Koji Sekiguchi 2008-11-14 01:56:21 +00:00
parent 4d7731fc90
commit eb0ec4a3e2
24 changed files with 1667 additions and 9 deletions

View File

@ -82,6 +82,9 @@ New Features
DirectoryProvider will use NIOFSDirectory for better concurrency
on non Windows platforms. (Mark Miller, TJ Laurenzo via yonik)
15. SOLR-822: Add CharFilter so that characters can be filtered (e.g. character normalization)
before Tokenizer/TokenFilters. (koji)

Optimizations
----------------------
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the

View File

@ -0,0 +1,246 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Syntax:
# "source" => "target"
# "source".length() > 0 (source cannot be empty.)
# "target".length() >= 0 (target can be empty.)
# example:
# "À" => "A"
# "\u00C0" => "A"
# "\u00C0" => "\u0041"
# "ß" => "ss"
# "\t" => " "
# "\n" => ""
# À => A
"\u00C0" => "A"
# Á => A
"\u00C1" => "A"
# Â => A
"\u00C2" => "A"
# Ã => A
"\u00C3" => "A"
# Ä => A
"\u00C4" => "A"
# Å => A
"\u00C5" => "A"
# Æ => AE
"\u00C6" => "AE"
# Ç => C
"\u00C7" => "C"
# È => E
"\u00C8" => "E"
# É => E
"\u00C9" => "E"
# Ê => E
"\u00CA" => "E"
# Ë => E
"\u00CB" => "E"
# Ì => I
"\u00CC" => "I"
# Í => I
"\u00CD" => "I"
# Î => I
"\u00CE" => "I"
# Ï => I
"\u00CF" => "I"
# IJ => IJ
"\u0132" => "IJ"
# Ð => D
"\u00D0" => "D"
# Ñ => N
"\u00D1" => "N"
# Ò => O
"\u00D2" => "O"
# Ó => O
"\u00D3" => "O"
# Ô => O
"\u00D4" => "O"
# Õ => O
"\u00D5" => "O"
# Ö => O
"\u00D6" => "O"
# Ø => O
"\u00D8" => "O"
# Œ => OE
"\u0152" => "OE"
# Þ => TH
"\u00DE" => "TH"
# Ù => U
"\u00D9" => "U"
# Ú => U
"\u00DA" => "U"
# Û => U
"\u00DB" => "U"
# Ü => U
"\u00DC" => "U"
# Ý => Y
"\u00DD" => "Y"
# Ÿ => Y
"\u0178" => "Y"
# à => a
"\u00E0" => "a"
# á => a
"\u00E1" => "a"
# â => a
"\u00E2" => "a"
# ã => a
"\u00E3" => "a"
# ä => a
"\u00E4" => "a"
# å => a
"\u00E5" => "a"
# æ => ae
"\u00E6" => "ae"
# ç => c
"\u00E7" => "c"
# è => e
"\u00E8" => "e"
# é => e
"\u00E9" => "e"
# ê => e
"\u00EA" => "e"
# ë => e
"\u00EB" => "e"
# ì => i
"\u00EC" => "i"
# í => i
"\u00ED" => "i"
# î => i
"\u00EE" => "i"
# ï => i
"\u00EF" => "i"
# ij => ij
"\u0133" => "ij"
# ð => d
"\u00F0" => "d"
# ñ => n
"\u00F1" => "n"
# ò => o
"\u00F2" => "o"
# ó => o
"\u00F3" => "o"
# ô => o
"\u00F4" => "o"
# õ => o
"\u00F5" => "o"
# ö => o
"\u00F6" => "o"
# ø => o
"\u00F8" => "o"
# œ => oe
"\u0153" => "oe"
# ß => ss
"\u00DF" => "ss"
# þ => th
"\u00FE" => "th"
# ù => u
"\u00F9" => "u"
# ú => u
"\u00FA" => "u"
# û => u
"\u00FB" => "u"
# ü => u
"\u00FC" => "u"
# ý => y
"\u00FD" => "y"
# ÿ => y
"\u00FF" => "y"
# ff => ff
"\uFB00" => "ff"
# fi => fi
"\uFB01" => "fi"
# fl => fl
"\uFB02" => "fl"
# ffi => ffi
"\uFB03" => "ffi"
# ffl => ffl
"\uFB04" => "ffl"
# ſt => ft
"\uFB05" => "ft"
# st => st
"\uFB06" => "st"

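A worked illustration (hypothetical input, not part of the file): a MappingCharFilter loaded with this file rewrites text before tokenization. With 1:1 rules like most of the ones above, token offsets are untouched; targets of a different length (e.g. "ß" => "ss") are what the offset-correction machinery added by this commit is for.

    input : "Pâté über"
    mapped: "Pate uber"
    tokens: Pate(0,4) uber(5,9)   -- offsets refer to the original input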
View File

@ -215,6 +215,16 @@
</analyzer>
</fieldType>
<!-- charFilter + "CharStream aware" WhitespaceTokenizer -->
<!--
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
<analyzer>
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
-->
<!-- This is an example of using the KeywordTokenizer along
with various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text

View File

@ -0,0 +1,67 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.ArrayList;
import java.util.List;
/**
* Base class for CharFilters that record position corrections as
* {@link PosCorrectMap} entries and apply them in correctPosition().
*
* @version $Id$
* @since Solr 1.4
*
*/
public abstract class BaseCharFilter extends CharFilter {
protected List<PosCorrectMap> pcmList;
public BaseCharFilter( CharStream in ){
super(in);
pcmList = new ArrayList<PosCorrectMap>();
}
protected int correctPosition( int currentPos ){
if( pcmList.isEmpty() ) return currentPos;
for( int i = pcmList.size() - 1; i >= 0; i-- ){
if( currentPos >= pcmList.get( i ).pos )
return currentPos + pcmList.get( i ).cumulativeDiff;
}
return currentPos;
}
protected static class PosCorrectMap {
protected int pos;
protected int cumulativeDiff;
public PosCorrectMap( int pos, int cumulativeDiff ){
this.pos = pos;
this.cumulativeDiff = cumulativeDiff;
}
public String toString(){
StringBuffer sb = new StringBuffer();
sb.append('(');
sb.append(pos);
sb.append(',');
sb.append(cumulativeDiff);
sb.append(')');
return sb.toString();
}
}
}
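A worked example (illustrative, not part of the commit): suppose a filter has recorded a single correction, new PosCorrectMap(1, 1), as MappingCharFilter later in this commit does after collapsing a leading "aa" to "a". Every position at or beyond pos is then shifted by cumulativeDiff back onto the original input:

    // pcmList = [ PosCorrectMap(pos=1, cumulativeDiff=1) ]
    // correctPosition(0) == 0   // before the correction point: unchanged
    // correctPosition(1) == 2   // 1 + cumulativeDiff
    // correctPosition(3) == 4   // 3 + cumulativeDiff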

View File

@ -0,0 +1,46 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Base class for CharFilterFactory implementations; holds the init
* arguments passed from the schema and exposes them via getArgs().
*
* @version $Id$
* @since Solr 1.4
*
*/
public abstract class BaseCharFilterFactory implements CharFilterFactory {
public static final Logger log = LoggerFactory.getLogger(BaseCharFilterFactory.class);
/** The init args */
protected Map<String,String> args;
public Map<String, String> getArgs() {
return args;
}
public void init(Map<String, String> args) {
this.args = args;
}
}

View File

@ -0,0 +1,63 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
/**
*
* Subclasses of CharFilter can be chained to filter a CharStream. Each
* filter delegates reads to the stream it wraps and may correct offsets.
*
* @version $Id$
* @since Solr 1.4
*
*/
public abstract class CharFilter extends CharStream {
protected CharStream input;
protected CharFilter( CharStream in ){
input = in;
}
/**
*
* Subclasses may override this to correct the current position; the
* default implementation returns the position unchanged.
*
* @param pos current position
* @return corrected position
*/
protected int correctPosition( int pos ){
return pos;
}
@Override
public final int correctOffset(int currentOff) {
return input.correctOffset( correctPosition( currentOff ) );
}
@Override
public void close() throws IOException {
input.close();
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
return input.read(cbuf, off, len);
}
}
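A minimal sketch (the subclass is hypothetical, not part of this commit) of the override pattern, and of how corrections compose when filters are chained:

    class AddOneCharFilter extends CharFilter {
      AddOneCharFilter(CharStream in) { super(in); }
      @Override
      protected int correctPosition(int pos) { return pos + 1; } // shift by one
    }

    // correctOffset() applies this filter's correction, then delegates inward,
    // so chaining two filters over a CharReader composes the shifts:
    //   new AddOneCharFilter(new AddOneCharFilter(new CharReader(r))).correctOffset(5) == 7
    // TestCharFilter at the end of this commit exercises exactly this behavior.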

View File

@ -0,0 +1,32 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.Map;
/**
* Factory for CharStream-wrapping filters: init() receives the factory
* arguments, and create() wraps a given CharStream with the filter.
*
* @version $Id$
* @since Solr 1.4
*
*/
public interface CharFilterFactory {
public void init(Map<String,String> args);
public Map<String,String> getArgs();
public CharStream create(CharStream input);
}

View File

@ -0,0 +1,52 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.Reader;
/**
* CharReader wraps a plain Reader as a CharStream; since no filtering has
* occurred yet, correctOffset() returns offsets unchanged.
*
* @version $Id$
* @since Solr 1.4
*
*/
public final class CharReader extends CharStream {
protected Reader input;
public CharReader( Reader in ){
input = in;
}
@Override
public int correctOffset(int currentOff) {
return currentOff;
}
@Override
public void close() throws IOException {
input.close();
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
return input.read(cbuf, off, len );
}
}

View File

@ -0,0 +1,38 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.Reader;
/**
* CharStream adds <a href="#correctOffset(int)">correctOffset</a> functionality over Reader.
*
* @version $Id$
* @since Solr 1.4
*
*/
public abstract class CharStream extends Reader {
/**
* Called by CharFilter(s) and Tokenizer to correct a token offset.
*
* @param currentOff current offset
* @return corrected token offset
*/
public abstract int correctOffset( int currentOff );
}

View File

@ -0,0 +1,276 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import java.io.Reader;
/**
* CJKTokenizer was modified from StopTokenizer, which does a decent job for
* most European languages. For double-byte characters it emits overlapping
* bigrams: a token is returned for every two adjacent characters.<br>
* Example: "java C1C2C3C4" is segmented into "java" "C1C2" "C2C3" "C3C4";
* zero-length tokens ("") still need to be filtered out downstream.<br>
* Digits and the characters '+' and '#' are tokenized as letters.<br>
* For more on CJK (Chinese, Japanese, Korean) text segmentation, see
* <a href="http://www.google.com/search?q=word+chinese+segment">google</a>
*
* LUCENE-973 is applied.
*
* @version $Id$
* @since Solr 1.4
*/
public final class CharStreamAwareCJKTokenizer extends Tokenizer {
//~ Static fields/initializers ---------------------------------------------
/** Word token type */
static final int WORD_TYPE = 0;
/** Single byte token type */
static final int SINGLE_TOKEN_TYPE = 1;
/** Double byte token type */
static final int DOUBLE_TOKEN_TYPE = 2;
/** Names for token types */
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
/** Max word length */
private static final int MAX_WORD_LEN = 255;
/** buffer size: */
private static final int IO_BUFFER_SIZE = 256;
//~ Instance fields --------------------------------------------------------
/** word offset, used to indicate which character (in the input) is being parsed */
private int offset = 0;
/** the index used only for ioBuffer */
private int bufferIndex = 0;
/** data length */
private int dataLen = 0;
/**
* character buffer, stores the characters which are used to compose
* the returned Token
*/
private final char[] buffer = new char[MAX_WORD_LEN];
/**
* I/O buffer, used to store the content of the input (one of the
* members of Tokenizer)
*/
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
/** word type: single=>ASCII double=>non-ASCII word=>default */
private int tokenType = WORD_TYPE;
/**
* flag: the previous character was already emitted as part of a
* double-byte (bigram) token. For "C1C2C3C4": emit C1C2 (C1 consumed),
* then C2C3 (C2 consumed), then C3C4, yielding "C1C2 C2C3 C3C4"
*/
private boolean preIsTokened = false;
//~ Constructors -----------------------------------------------------------
/**
* Construct a token stream processing the given input.
*
* @param in I/O reader
*/
public CharStreamAwareCJKTokenizer(CharStream in) {
super(in);
}
//~ Methods ----------------------------------------------------------------
/**
* Returns the next token in the stream, or null at EOS.
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
* for detail.
*
* @param reusableToken a reusable token
* @return Token
*
* @throws java.io.IOException if a read error occurs on the
* underlying input
*
*/
public final Token next(final Token reusableToken) throws java.io.IOException {
/** how many character(s) have been stored in the buffer */
assert reusableToken != null;
int length = 0;
/** the position used to create Token */
int start = offset;
while (true) {
/** current character */
char c;
/** unicode block of current character for detail */
Character.UnicodeBlock ub;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLen == -1) {
if (length > 0) {
if (preIsTokened) {
length = 0;
preIsTokened = false;
}
break;
} else {
return null;
}
} else {
//get current character
c = ioBuffer[bufferIndex++];
//get the UnicodeBlock of the current character
ub = Character.UnicodeBlock.of(c);
}
//if the current character is ASCII or Extended ASCII
if ((ub == Character.UnicodeBlock.BASIC_LATIN)
|| (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
) {
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
// convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
int i = (int) c;
i = i - 65248;
c = (char) i;
}
// if the current character is a letter or "_" "+" "#"
if (Character.isLetterOrDigit(c)
|| ((c == '_') || (c == '+') || (c == '#'))
) {
if (length == 0) {
// "javaC1C2C3C4linux"
// ^--: the current character begins an ASCII token
start = offset - 1;
} else if (tokenType == DOUBLE_TOKEN_TYPE) {
// "javaC1C2C3C4linux"
// ^--: the previous character is non-ASCII,
// the current character is ASCII
offset--;
bufferIndex--;
if (preIsTokened) {
// only one non-ASCII character has been stored
length = 0;
preIsTokened = false;
break;
} else {
break;
}
}
// store the LowerCase(c) in the buffer
buffer[length++] = Character.toLowerCase(c);
tokenType = SINGLE_TOKEN_TYPE;
// break the procedure if buffer overflowed!
if (length == MAX_WORD_LEN) {
break;
}
} else if (length > 0) {
if (preIsTokened) {
length = 0;
preIsTokened = false;
} else {
break;
}
}
} else {
// non-ASCII letter, e.g."C1C2C3C4"
if (Character.isLetter(c)) {
if (length == 0) {
start = offset - 1;
buffer[length++] = c;
tokenType = DOUBLE_TOKEN_TYPE;
} else {
if (tokenType == SINGLE_TOKEN_TYPE) {
offset--;
bufferIndex--;
//return the previous ASCII characters
break;
} else {
buffer[length++] = c;
tokenType = DOUBLE_TOKEN_TYPE;
if (length == 2) {
offset--;
bufferIndex--;
preIsTokened = true;
break;
}
}
}
} else if (length > 0) {
if (preIsTokened) {
// empty the buffer
length = 0;
preIsTokened = false;
} else {
break;
}
}
}
}
if (length > 0) {
// Because of "CharStream aware" tokenizer, using correctOffset() to
// correct start/end offsets
return reusableToken.reinit
(buffer, 0, length,
((CharStream)input).correctOffset( start ),
((CharStream)input).correctOffset( start+length ),
TOKEN_TYPE_NAMES[tokenType]);
} else if (dataLen != -1) {
// Don't return an empty string - recurse to get the next token
return next(reusableToken);
} else {
return null;
}
}
}
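A hedged usage sketch (the variable names and input string are illustrative; assumes the Lucene 2.4 reusable-Token API and imports matching the file above, plus java.io.StringReader):

    CharStream cs = new CharReader(new StringReader("java \u4ECA\u65E5\u306F"));
    CharStreamAwareCJKTokenizer t = new CharStreamAwareCJKTokenizer(cs);
    final Token reusable = new Token();
    for (Token tok = t.next(reusable); tok != null; tok = t.next(reusable)) {
      // ASCII runs come out as single tokens, CJK characters as overlapping
      // bigrams; offsets are already corrected via the underlying CharStream.
      System.out.println(tok.term() + " [" + tok.startOffset() + "," + tok.endOffset() + ")");
    }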

View File

@ -0,0 +1,35 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
/**
* Factory for CharStreamAwareCJKTokenizer. Note that create() expects the
* given Reader to actually be a CharStream.
*
* @version $Id$
* @since Solr 1.4
*
*/
public class CharStreamAwareCJKTokenizerFactory extends BaseTokenizerFactory {
public TokenStream create(Reader input) {
return new CharStreamAwareCJKTokenizer( (CharStream)input );
}
}

View File

@ -0,0 +1,102 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
/** An abstract base class for simple, character-oriented tokenizers that
* correct token offsets through the underlying CharStream. */
public abstract class CharStreamAwareCharTokenizer extends Tokenizer {
public CharStreamAwareCharTokenizer(CharStream input) {
super(input);
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
/** Returns true iff a character should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
* satisfy this predicate. Characters for which this is false are used to
* define token boundaries and are not included in tokens. */
protected abstract boolean isTokenChar(char c);
/** Called on each token character to normalize it before it is added to the
* token. The default implementation does nothing. Subclasses may use this
* to, e.g., lowercase tokens. */
protected char normalize(char c) {
return c;
}
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
reusableToken.clear();
int length = 0;
int start = bufferIndex;
char[] buffer = reusableToken.termBuffer();
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
dataLen = input.read(ioBuffer);
if (dataLen == -1) {
if (length > 0)
break;
else
return null;
}
bufferIndex = 0;
}
final char c = ioBuffer[bufferIndex++];
if (isTokenChar(c)) { // if it's a token char
if (length == 0) // start of token
start = offset + bufferIndex - 1;
else if (length == buffer.length)
buffer = reusableToken.resizeTermBuffer(1+length);
buffer[length++] = normalize(c); // buffer it, normalized
if (length == MAX_WORD_LEN) // buffer overflow!
break;
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
reusableToken.setTermLength(length);
// Because of "CharStream aware" tokenizer, using correctOffset() to
// correct start/end offsets
reusableToken.setStartOffset(((CharStream)input).correctOffset(start));
reusableToken.setEndOffset(((CharStream)input).correctOffset(start+length));
return reusableToken;
}
public void reset(Reader input) throws IOException {
super.reset(input);
bufferIndex = 0;
offset = 0;
dataLen = 0;
}
}

View File

@ -0,0 +1,33 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
* Adjacent sequences of non-whitespace characters form tokens. */
public class CharStreamAwareWhitespaceTokenizer extends CharStreamAwareCharTokenizer {
/** Construct a new WhitespaceTokenizer. */
public CharStreamAwareWhitespaceTokenizer(CharStream in) {
super(in);
}
/** Collects only characters which do not satisfy
* {@link Character#isWhitespace(char)}.*/
protected boolean isTokenChar(char c) {
return !Character.isWhitespace(c);
}
}

View File

@ -0,0 +1,35 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
/**
* Factory for CharStreamAwareWhitespaceTokenizer. Note that create() expects
* the given Reader to actually be a CharStream.
*
* @version $Id$
* @since Solr 1.4
*
*/
public class CharStreamAwareWhitespaceTokenizerFactory extends BaseTokenizerFactory {
public TokenStream create(Reader input) {
return new CharStreamAwareWhitespaceTokenizer( (CharStream)input );
}
}

View File

@ -0,0 +1,136 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.LinkedList;
/**
* CharFilter that maps characters according to a NormalizeMap, recording an
* offset correction whenever a mapping changes the length of the text.
*
* @version $Id$
* @since Solr 1.4
*
*/
public class MappingCharFilter extends BaseCharFilter {
private final NormalizeMap normMap;
private LinkedList<Character> buffer;
private String replacement;
private int charPointer;
private int nextCharCounter;
public MappingCharFilter( NormalizeMap normMap, CharStream in ){
super( in );
this.normMap = normMap;
}
public int read() throws IOException {
while( true ){
if( replacement != null && charPointer < replacement.length() )
return replacement.charAt( charPointer++ );
int firstChar = nextChar();
if( firstChar == -1 ) return -1;
NormalizeMap nm = normMap.submap != null ?
normMap.submap.get( (char)firstChar ) : null;
if( nm == null ) return firstChar;
NormalizeMap result = match( nm );
if( result == null ) return firstChar;
replacement = result.normStr;
charPointer = 0;
if( result.diff != 0 ){
int prevCumulativeDiff = pcmList.isEmpty() ? 0 :
pcmList.get( pcmList.size() - 1 ).cumulativeDiff;
if( result.diff < 0 ){
for( int i = 0; i < -result.diff ; i++ )
pcmList.add( new PosCorrectMap( nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i ) );
}
else{
pcmList.add( new PosCorrectMap( nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff ) );
}
}
}
}
private int nextChar() throws IOException {
nextCharCounter++;
if( buffer != null && !buffer.isEmpty() )
return buffer.removeFirst();
return input.read();
}
private void pushChar( int c ){
nextCharCounter--;
if( buffer == null )
buffer = new LinkedList<Character>();
buffer.addFirst( (char)c );
}
private void pushLastChar( int c ){
if( buffer == null )
buffer = new LinkedList<Character>();
buffer.addLast( (char)c );
}
private NormalizeMap match( NormalizeMap map ) throws IOException {
NormalizeMap result = null;
if( map.submap != null ){
int chr = nextChar();
if( chr != -1 ){
NormalizeMap subMap = map.submap.get( (char)chr );
if( subMap != null ){
result = match( subMap );
}
if( result == null )
pushChar( chr );
}
}
if( result == null && map.normStr != null )
result = map;
return result;
}
public int read( char[] cbuf, int off, int len ) throws IOException {
char[] tmp = new char[len];
int l = input.read( tmp, 0, len );
if( l != -1 ){
for( int i = 0; i < l; i++ )
pushLastChar( tmp[i] );
}
l = 0;
for( int i = off; i < off + len; i++ ){
int c = read();
if( c == -1 ) break;
cbuf[i] = (char)c;
l++;
}
return l == 0 ? -1 : l;
}
public boolean markSupported(){
return false;
}
public void mark( int readAheadLimit ) throws IOException {
throw new IOException( "mark/reset not supported" );
}
public void reset() throws IOException {
throw new IOException( "mark/reset not supported" );
}
}
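A hedged end-to-end sketch (illustrative fragment; assumes it runs in org.apache.solr.analysis with java.io.StringReader imported — the numbers match TestMappingCharFilter below):

    NormalizeMap normMap = new NormalizeMap();
    normMap.add("aa", "a");                             // shrinking rule: diff = +1
    CharStream cs = new MappingCharFilter(normMap,
        new CharReader(new StringReader("aa b")));
    StringBuilder out = new StringBuilder();
    int c;
    while ((c = cs.read()) != -1) out.append((char) c); // filtered stream: "a b"
    // Reading recorded PosCorrectMap(pos=1, cumulativeDiff=1), so offsets in
    // the filtered text map back onto the original input:
    //   cs.correctOffset(0) == 0   start of "a"  (start of input "aa")
    //   cs.correctOffset(1) == 2   end of "a"    (end of input "aa")
    //   cs.correctOffset(3) == 4   end of "b"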

View File

@ -0,0 +1,118 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
* Factory for MappingCharFilter: reads mapping rules from the file(s) named
* by the "mapping" init argument and builds the NormalizeMap.
*
* @version $Id$
* @since Solr 1.4
*
*/
public class MappingCharFilterFactory extends BaseCharFilterFactory implements
ResourceLoaderAware {
protected NormalizeMap normMap;
private String mapping;
public void inform(ResourceLoader loader) {
mapping = args.get( "mapping" );
if( mapping != null ){
List<String> wlist = null;
try{
File mappingFile = new File( mapping );
if( mappingFile.exists() ){
wlist = loader.getLines( mapping );
}
else{
List<String> files = StrUtils.splitFileNames( mapping );
wlist = new ArrayList<String>();
for( String file : files ){
List<String> lines = loader.getLines( file.trim() );
wlist.addAll( lines );
}
}
}
catch( IOException e ){
throw new RuntimeException( e );
}
normMap = new NormalizeMap();
parseRules( wlist, normMap );
}
}
public CharStream create(CharStream input) {
return new MappingCharFilter(normMap,input);
}
// "source" => "target"
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
protected void parseRules( List<String> rules, NormalizeMap normMap ){
for( String rule : rules ){
Matcher m = p.matcher( rule );
if( !m.find() )
throw new RuntimeException( "Invalid Mapping Rule : [" + rule + "], file = " + mapping );
normMap.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) );
}
}
char[] out = new char[256];
protected String parseString( String s ){
int readPos = 0;
int len = s.length();
int writePos = 0;
while( readPos < len ){
char c = s.charAt( readPos++ );
if( c == '\\' ){
if( readPos >= len )
throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
c = s.charAt( readPos++ );
switch( c ) {
case '\\' : c = '\\'; break;
case '"' : c = '"'; break;
case 'n' : c = '\n'; break;
case 't' : c = '\t'; break;
case 'r' : c = '\r'; break;
case 'b' : c = '\b'; break;
case 'f' : c = '\f'; break;
case 'u' :
if( readPos + 3 >= len )
throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
readPos += 4;
break;
}
}
out[writePos++] = c;
}
return new String( out, 0, writePos );
}
}
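A hypothetical wiring sketch of what the schema loader effectively does for <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/> ('loader' is a ResourceLoader and 'reader' a java.io.Reader, both assumed to exist):

    MappingCharFilterFactory factory = new MappingCharFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("mapping", "mapping-ISOLatin1Accent.txt");
    factory.init(args);     // stores the init args (BaseCharFilterFactory)
    factory.inform(loader); // resolves the file(s) and parses the mapping rules
    CharStream cs = factory.create(new CharReader(reader));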

View File

@ -0,0 +1,55 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.HashMap;
import java.util.Map;
/**
* A character trie of mapping rules: each node maps the next character to a
* sub-node, and a node with a non-null normStr terminates a match.
*
* @version $Id$
* @since Solr 1.4
*
*/
public class NormalizeMap {
Map<Character, NormalizeMap> submap;
String normStr;
int diff;
public void add( String singleMatch, String replacement ){
NormalizeMap currMap = this;
for( int i = 0; i < singleMatch.length(); i++ ){
char c = singleMatch.charAt( i );
if( currMap.submap == null ){
currMap.submap = new HashMap<Character, NormalizeMap>( 1 );
}
NormalizeMap map = currMap.submap.get( c );
if( map == null ){
map = new NormalizeMap();
currMap.submap.put( c, map );
}
currMap = map;
}
if( currMap.normStr != null ){
throw new RuntimeException( "MappingCharFilter: there is already a mapping for " + singleMatch );
}
currMap.normStr = replacement;
currMap.diff = singleMatch.length() - replacement.length();
}
}
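Illustrative only (not part of the commit): the trie built by a couple of add() calls, and how it is consumed:

    NormalizeMap root = new NormalizeMap();
    root.add("aa", "a");   // root.submap['a'].submap['a']: normStr = "a",  diff = 1
    root.add("ab", "xy");  // root.submap['a'].submap['b']: normStr = "xy", diff = 0
    // MappingCharFilter.match() walks submap one character at a time and
    // keeps the longest match whose node has a non-null normStr.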

View File

@ -31,19 +31,37 @@ import java.io.Reader;
// create a TokenStream.
//
public class TokenizerChain extends SolrAnalyzer {
final private CharFilterFactory[] charFilters;
final private TokenizerFactory tokenizer;
final private TokenFilterFactory[] filters;
public TokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
this(null,tokenizer,filters);
}
public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
this.charFilters = charFilters;
this.tokenizer = tokenizer;
this.filters = filters;
}
public CharFilterFactory[] getCharFilterFactories() { return charFilters; }
public TokenizerFactory getTokenizerFactory() { return tokenizer; }
public TokenFilterFactory[] getTokenFilterFactories() { return filters; }
public Reader charStream(Reader reader){
if( charFilters != null && charFilters.length > 0 ){
CharStream cs = new CharReader( reader );
for (int i=0; i<charFilters.length; i++) {
cs = charFilters[i].create(cs);
}
reader = cs;
}
return reader;
}
public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream ts = tokenizer.create(reader);
+ TokenStream ts = tokenizer.create(charStream(reader));
for (int i=0; i<filters.length; i++) {
ts = filters[i].create(ts);
}
@ -52,6 +70,10 @@ public class TokenizerChain extends SolrAnalyzer {
public String toString() {
StringBuilder sb = new StringBuilder("TokenizerChain(");
for (CharFilterFactory filter: charFilters) {
sb.append(filter);
sb.append(", ");
}
sb.append(tokenizer);
for (TokenFilterFactory filter: filters) {
sb.append(", ");

View File

@ -37,6 +37,7 @@ import javax.naming.InitialContext;
import javax.naming.NamingException;
import javax.naming.NoInitialContextException;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.ResourceLoader;
@ -394,8 +395,9 @@ public class SolrResourceLoader implements ResourceLoader
}
);
awareCompatibility.put(
ResourceLoaderAware.class, new Class[] {
CharFilterFactory.class,
TokenFilterFactory.class,
TokenizerFactory.class,
FieldType.class
@ -427,5 +429,5 @@ public class SolrResourceLoader implements ResourceLoader
}
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, builder.toString() );
}
}

View File

@ -29,6 +29,7 @@ import org.apache.solr.common.util.DOMUtil;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.Config;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
@ -739,12 +740,33 @@ public final class IndexSchema {
XPath xpath = XPathFactory.newInstance().newXPath();
// Load the CharFilters
// --------------------------------------------------------------------------------
final ArrayList<CharFilterFactory> charFilters = new ArrayList<CharFilterFactory>();
AbstractPluginLoader<CharFilterFactory> charFilterLoader =
new AbstractPluginLoader<CharFilterFactory>( "[schema.xml] analyzer/charFilter", false, false )
{
@Override
protected void init(CharFilterFactory plugin, Node node) throws Exception {
if( plugin != null ) {
plugin.init( DOMUtil.toMapExcept(node.getAttributes(),"class") );
charFilters.add( plugin );
}
}
@Override
protected CharFilterFactory register(String name, CharFilterFactory plugin) throws Exception {
return null; // used for map registration
}
};
charFilterLoader.load( solrConfig.getResourceLoader(), (NodeList)xpath.evaluate("./charFilter", node, XPathConstants.NODESET) );
// Load the Tokenizer
// Although an analyzer only allows a single Tokenizer, we load a list to make sure
// the configuration is ok
// --------------------------------------------------------------------------------
final ArrayList<TokenizerFactory> tokenizers = new ArrayList<TokenizerFactory>(1);
AbstractPluginLoader<TokenizerFactory> tokenizerLoader =
new AbstractPluginLoader<TokenizerFactory>( "[schema.xml] analyzer/tokenizer", false, false )
{
@Override
@ -790,8 +812,9 @@ public final class IndexSchema {
}
};
filterLoader.load( loader, (NodeList)xpath.evaluate("./filter", node, XPathConstants.NODESET) );
- return new TokenizerChain(tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));
+ return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]),
+     tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));
};

View File

@ -0,0 +1,52 @@
package org.apache.solr.analysis;
import java.io.StringReader;
import junit.framework.TestCase;
public class TestCharFilter extends TestCase {
public void testCharFilter1() throws Exception {
CharStream cs = new CharFilter1( new CharReader( new StringReader("") ) );
assertEquals( "corrected position is invalid", 1, cs.correctOffset( 0 ) );
}
public void testCharFilter2() throws Exception {
CharStream cs = new CharFilter2( new CharReader( new StringReader("") ) );
assertEquals( "corrected position is invalid", 2, cs.correctOffset( 0 ) );
}
public void testCharFilter12() throws Exception {
CharStream cs = new CharFilter2( new CharFilter1( new CharReader( new StringReader("") ) ) );
assertEquals( "corrected position is invalid", 3, cs.correctOffset( 0 ) );
}
public void testCharFilter11() throws Exception {
CharStream cs = new CharFilter1( new CharFilter1( new CharReader( new StringReader("") ) ) );
assertEquals( "corrected position is invalid", 2, cs.correctOffset( 0 ) );
}
static class CharFilter1 extends CharFilter {
protected CharFilter1(CharStream in) {
super(in);
}
@Override
protected int correctPosition(int currentPos) {
return currentPos + 1;
}
}
static class CharFilter2 extends CharFilter {
protected CharFilter2(CharStream in) {
super(in);
}
@Override
protected int correctPosition(int currentPos) {
return currentPos + 2;
}
}
}

View File

@ -0,0 +1,160 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.StringReader;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
public class TestMappingCharFilter extends BaseTokenTestCase {
NormalizeMap normMap;
public void setUp() throws Exception {
normMap = new NormalizeMap();
normMap.add( "aa", "a" );
normMap.add( "bbb", "b" );
normMap.add( "cccc", "cc" );
normMap.add( "h", "i" );
normMap.add( "j", "jj" );
normMap.add( "k", "kkk" );
normMap.add( "ll", "llll" );
normMap.add( "empty", "" );
}
public void testNothingChange() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "x" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "x" );
assertTokEqualOff( expect, real );
}
public void test1to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "h" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "i" );
assertTokEqualOff( expect, real );
}
public void test1to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "j" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "jj,1,0,1" );
assertTokEqualOff( expect, real );
}
public void test1to3() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "k" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "kkk,1,0,1" );
assertTokEqualOff( expect, real );
}
public void test2to4() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "ll" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "llll,1,0,2" );
assertTokEqualOff( expect, real );
}
public void test2to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "aa" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "a,1,0,2" );
assertTokEqualOff( expect, real );
}
public void test3to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "bbb" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "b,1,0,3" );
assertTokEqualOff( expect, real );
}
public void test4to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "cccc" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "cc,1,0,4" );
assertTokEqualOff( expect, real );
}
public void test5to0() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "empty" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
assertEquals( 0, real.size() );
}
//
//                1111111111222
//      01234567890123456789012
//(in)  h i j k ll cccc bbb aa
//
//                1111111111222
//      01234567890123456789012
//(out) i i jj kkk llll cc b a
//
// h, 0, 1 => i, 0, 1
// i, 2, 3 => i, 2, 3
// j, 4, 5 => jj, 4, 5
// k, 6, 7 => kkk, 6, 7
// ll, 8,10 => llll, 8,10
// cccc,11,15 => cc,11,15
// bbb,16,19 => b,16,19
// aa,20,22 => a,20,22
//
public void testTokenStream() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "h i j k ll cccc bbb aa" ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "i,1,0,1 i,1,2,3 jj,1,4,5 kkk,1,6,7 llll,1,8,10 cc,1,11,15 b,1,16,19 a,1,20,22" );
assertTokEqualOff( expect, real );
}
//
//        0123456789
//(in)    aaaa ll h
//(out-1) aa llll i
//(out-2) a llllllll i
//
// aaaa,0,4 => a,0,4
//   ll,5,7 => llllllll,5,7
//    h,8,9 => i,8,9
public void testChained() throws Exception {
CharStream cs = new MappingCharFilter( normMap,
new MappingCharFilter( normMap, new CharReader( new StringReader( "aaaa ll h" ) ) ) );
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
List<Token> real = getTokens( ts );
List<Token> expect = tokens( "a,1,0,4 llllllll,1,5,7 i,1,8,9" );
assertTokEqualOff( expect, real );
}
}

View File

@ -0,0 +1,52 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import junit.framework.TestCase;
public class TestMappingCharFilterFactory extends TestCase {
public void testParseString() throws Exception {
MappingCharFilterFactory f = new MappingCharFilterFactory();
try {
f.parseString( "\\" );
fail( "escape character cannot be alone." );
}
catch( RuntimeException expected ){}
assertEquals( "unexpected escaped characters",
"\\\"\n\t\r\b\f", f.parseString( "\\\\\\\"\\n\\t\\r\\b\\f" ) );
assertEquals( "unexpected escaped characters",
"A", f.parseString( "\\u0041" ) );
assertEquals( "unexpected escaped characters",
"AB", f.parseString( "\\u0041\\u0042" ) );
try {
f.parseString( "\\u000" );
fail( "invalid length check." );
}
catch( RuntimeException expected ){}
try {
f.parseString( "\\u123x" );
fail( "invalid hex number check." );
}
catch( NumberFormatException expected ){}
}
}

View File

@ -181,9 +181,9 @@
TokenizerFactory tfac = tchain.getTokenizerFactory();
TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
- TokenStream tstream = tfac.create(reader);
+ TokenStream tstream = tfac.create(tchain.charStream(reader));
List<Token> tokens = getTokens(tstream);
- tstream = tfac.create(reader);
+ tstream = tfac.create(tchain.charStream(reader));
if (verbose) {
writeHeader(out, tfac.getClass(), tfac.getArgs());
}