mirror of https://github.com/apache/lucene.git
Added JapaneseIterationMarkCharFilter (LUCENE-4201)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1359613 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e78a40adcd
commit
a15331043f
|
@ -11,6 +11,9 @@ http://s.apache.org/luceneversions
|
|||
|
||||
New features
|
||||
|
||||
* LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese
|
||||
iteration marks. (Robert Muir, Christian Moen)
|
||||
|
||||
* LUCENE-3832: Added BasicAutomata.makeStringUnion method to efficiently
|
||||
create automata from a fixed collection of UTF-8 encoded BytesRef
|
||||
(Dawid Weiss, Robert Muir)
|
||||
|
|
|
@ -0,0 +1,456 @@
|
|||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.charfilter.CharFilter;
|
||||
import org.apache.lucene.util.RollingCharBuffer;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
|
||||
* <p>
|
||||
* Sequences of iteration marks are supported. In case an illegal sequence of iteration
|
||||
* marks is encountered, the implementation emits the illegal source character as-is
|
||||
* without considering its script. For example, with input "?ゝ", we get
|
||||
* "??" even though "?" isn't hiragana.
|
||||
* </p>
|
||||
* <p>
|
||||
* Note that a full stop punctuation character "。" (U+3002) can not be iterated
|
||||
* (see below). Iteration marks themselves can be emitted in case they are illegal,
|
||||
* i.e. if they go back past the beginning of the character stream.
|
||||
* </p>
|
||||
* <p>
|
||||
* The implementation buffers input until a full stop punctuation character (U+3002)
|
||||
* or EOF is reached in order to not keep a copy of the character stream in memory.
|
||||
* Vertical iteration marks, which are even rarer than horizonal iteration marks in
|
||||
* contemporary Japanese, are unsupported.
|
||||
* </p>
|
||||
*/
|
||||
public class JapaneseIterationMarkCharFilter extends CharFilter {
|
||||
|
||||
/** Normalize kanji iteration marks by default */
|
||||
public static final boolean NORMALIZE_KANJI_DEFAULT = true;
|
||||
|
||||
/** Normalize kana iteration marks by default */
|
||||
public static final boolean NORMALIZE_KANA_DEFAULT = true;
|
||||
|
||||
private static final char KANJI_ITERATION_MARK = '\u3005'; // 々
|
||||
|
||||
private static final char HIRAGANA_ITERATION_MARK = '\u309d'; // ゝ
|
||||
|
||||
private static final char HIRAGANA_VOICED_ITERATION_MARK = '\u309e'; // ゞ
|
||||
|
||||
private static final char KATAKANA_ITERATION_MARK = '\u30fd'; // ヽ
|
||||
|
||||
private static final char KATAKANA_VOICED_ITERATION_MARK = '\u30fe'; // ヾ
|
||||
|
||||
private static final char FULL_STOP_PUNCTUATION = '\u3002'; // 。
|
||||
|
||||
// Hiragana to dakuten map (lookup using code point - 0x30ab(か)*/
|
||||
private static char[] h2d = new char[50];
|
||||
|
||||
// Katakana to dakuten map (lookup using code point - 0x30ab(カ
|
||||
private static char[] k2d = new char[50];
|
||||
|
||||
private final RollingCharBuffer buffer = new RollingCharBuffer();
|
||||
|
||||
private int bufferPosition = 0;
|
||||
|
||||
private int iterationMarksSpanSize = 0;
|
||||
|
||||
private int iterationMarkSpanEndPosition = 0;
|
||||
|
||||
private boolean normalizeKanji;
|
||||
|
||||
private boolean normalizeKana;
|
||||
|
||||
static {
|
||||
// Hiragana dakuten map
|
||||
h2d[0] = '\u304c'; // か => が
|
||||
h2d[1] = '\u304c'; // が => が
|
||||
h2d[2] = '\u304e'; // き => ぎ
|
||||
h2d[3] = '\u304e'; // ぎ => ぎ
|
||||
h2d[4] = '\u3050'; // く => ぐ
|
||||
h2d[5] = '\u3050'; // ぐ => ぐ
|
||||
h2d[6] = '\u3052'; // け => げ
|
||||
h2d[7] = '\u3052'; // げ => げ
|
||||
h2d[8] = '\u3054'; // こ => ご
|
||||
h2d[9] = '\u3054'; // ご => ご
|
||||
h2d[10] = '\u3056'; // さ => ざ
|
||||
h2d[11] = '\u3056'; // ざ => ざ
|
||||
h2d[12] = '\u3058'; // し => じ
|
||||
h2d[13] = '\u3058'; // じ => じ
|
||||
h2d[14] = '\u305a'; // す => ず
|
||||
h2d[15] = '\u305a'; // ず => ず
|
||||
h2d[16] = '\u305c'; // せ => ぜ
|
||||
h2d[17] = '\u305c'; // ぜ => ぜ
|
||||
h2d[18] = '\u305e'; // そ => ぞ
|
||||
h2d[19] = '\u305e'; // ぞ => ぞ
|
||||
h2d[20] = '\u3060'; // た => だ
|
||||
h2d[21] = '\u3060'; // だ => だ
|
||||
h2d[22] = '\u3062'; // ち => ぢ
|
||||
h2d[23] = '\u3062'; // ぢ => ぢ
|
||||
h2d[24] = '\u3063';
|
||||
h2d[25] = '\u3065'; // つ => づ
|
||||
h2d[26] = '\u3065'; // づ => づ
|
||||
h2d[27] = '\u3067'; // て => で
|
||||
h2d[28] = '\u3067'; // で => で
|
||||
h2d[29] = '\u3069'; // と => ど
|
||||
h2d[30] = '\u3069'; // ど => ど
|
||||
h2d[31] = '\u306a';
|
||||
h2d[32] = '\u306b';
|
||||
h2d[33] = '\u306c';
|
||||
h2d[34] = '\u306d';
|
||||
h2d[35] = '\u306e';
|
||||
h2d[36] = '\u3070'; // は => ば
|
||||
h2d[37] = '\u3070'; // ば => ば
|
||||
h2d[38] = '\u3071';
|
||||
h2d[39] = '\u3073'; // ひ => び
|
||||
h2d[40] = '\u3073'; // び => び
|
||||
h2d[41] = '\u3074';
|
||||
h2d[42] = '\u3076'; // ふ => ぶ
|
||||
h2d[43] = '\u3076'; // ぶ => ぶ
|
||||
h2d[44] = '\u3077';
|
||||
h2d[45] = '\u3079'; // へ => べ
|
||||
h2d[46] = '\u3079'; // べ => べ
|
||||
h2d[47] = '\u307a';
|
||||
h2d[48] = '\u307c'; // ほ => ぼ
|
||||
h2d[49] = '\u307c'; // ぼ => ぼ
|
||||
|
||||
// Make katakana dakuten map from hiragana map
|
||||
char codePointDifference = '\u30ab' - '\u304b'; // カ - か
|
||||
assert h2d.length == k2d.length;
|
||||
for (int i = 0; i < k2d.length; i++) {
|
||||
k2d[i] = (char) (h2d[i] + codePointDifference);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor. Normalizes both kanji and kana iteration marks by default.
|
||||
*
|
||||
* @param input char stream
|
||||
*/
|
||||
public JapaneseIterationMarkCharFilter(CharStream input) {
|
||||
this(input, NORMALIZE_KANJI_DEFAULT, NORMALIZE_KANA_DEFAULT);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param input char stream
|
||||
* @param normalizeKanji indicates whether kanji iteration marks should be normalized
|
||||
* @param normalizeKana indicates whether kana iteration marks should be normalized
|
||||
*/
|
||||
public JapaneseIterationMarkCharFilter(CharStream input, boolean normalizeKanji, boolean normalizeKana) {
|
||||
super(input);
|
||||
this.normalizeKanji = normalizeKanji;
|
||||
this.normalizeKana = normalizeKana;
|
||||
buffer.reset(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public int read(char[] buffer, int offset, int length) throws IOException {
|
||||
int read = 0;
|
||||
|
||||
for (int i = offset; i < offset + length; i++) {
|
||||
int c = read();
|
||||
if (c == -1) {
|
||||
break;
|
||||
}
|
||||
buffer[i] = (char) c;
|
||||
read++;
|
||||
}
|
||||
|
||||
return read == 0 ? -1 : read;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
int ic = buffer.get(bufferPosition);
|
||||
|
||||
// End of input
|
||||
if (ic == -1) {
|
||||
buffer.freeBefore(bufferPosition);
|
||||
return ic;
|
||||
}
|
||||
|
||||
char c = (char) ic;
|
||||
|
||||
// Skip surrogate pair characters
|
||||
if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
|
||||
iterationMarkSpanEndPosition = bufferPosition + 1;
|
||||
}
|
||||
|
||||
// Free rolling buffer on full stop
|
||||
if (c == FULL_STOP_PUNCTUATION) {
|
||||
buffer.freeBefore(bufferPosition);
|
||||
iterationMarkSpanEndPosition = bufferPosition + 1;
|
||||
}
|
||||
|
||||
// Normalize iteration mark
|
||||
if (isIterationMark(c)) {
|
||||
c = normalizeIterationMark(c);
|
||||
}
|
||||
|
||||
bufferPosition++;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes the iteration mark character c
|
||||
*
|
||||
* @param c iteration mark character to normalize
|
||||
* @return normalized iteration mark
|
||||
* @throws IOException
|
||||
*/
|
||||
private char normalizeIterationMark(char c) throws IOException {
|
||||
|
||||
// Case 1: Inside an iteration mark span
|
||||
if (bufferPosition < iterationMarkSpanEndPosition) {
|
||||
return normalize(sourceCharacter(bufferPosition, iterationMarksSpanSize), c);
|
||||
}
|
||||
|
||||
// Case 2: New iteration mark spans starts where the previous one ended, which is illegal
|
||||
if (bufferPosition == iterationMarkSpanEndPosition) {
|
||||
// Emit the illegal iteration mark and increase end position to indicate that we can't
|
||||
// start a new span on the next position either
|
||||
iterationMarkSpanEndPosition++;
|
||||
return c;
|
||||
}
|
||||
|
||||
// Case 3: New iteration mark span
|
||||
iterationMarksSpanSize = nextIterationMarkSpanSize();
|
||||
iterationMarkSpanEndPosition = bufferPosition + iterationMarksSpanSize;
|
||||
return normalize(sourceCharacter(bufferPosition, iterationMarksSpanSize), c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the number of subsequent next iteration marks
|
||||
*
|
||||
* @return number of iteration marks starting at the current buffer position
|
||||
* @throws IOException
|
||||
*/
|
||||
private int nextIterationMarkSpanSize() throws IOException {
|
||||
int spanSize = 0;
|
||||
for (int i = bufferPosition; buffer.get(i) != -1 && isIterationMark((char) (buffer.get(i))); i++) {
|
||||
spanSize++;
|
||||
}
|
||||
// Restrict span size so that we don't go past the previous end position
|
||||
if (bufferPosition - spanSize < iterationMarkSpanEndPosition) {
|
||||
spanSize = bufferPosition - iterationMarkSpanEndPosition;
|
||||
}
|
||||
return spanSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the source character for a given position and iteration mark span size
|
||||
*
|
||||
* @param position buffer position (should not exceed bufferPosition)
|
||||
* @param spanSize iteration mark span size
|
||||
* @return source character
|
||||
* @throws IOException
|
||||
*/
|
||||
private char sourceCharacter(int position, int spanSize) throws IOException {
|
||||
return (char) buffer.get(position - spanSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize a character
|
||||
*
|
||||
* @param c character to normalize
|
||||
* @param m repetition mark referring to c
|
||||
* @return normalized character - return c on illegal iteration marks
|
||||
*/
|
||||
private char normalize(char c, char m) {
|
||||
if (isHiraganaIterationMark(m)) {
|
||||
return normalizedHiragana(c, m);
|
||||
}
|
||||
|
||||
if (isKatakanaIterationMark(m)) {
|
||||
return normalizedKatakana(c, m);
|
||||
}
|
||||
|
||||
return c; // If m is not kana and we are to normalize it, we assume it is kanji and simply return it
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize hiragana character
|
||||
*
|
||||
* @param c hiragana character
|
||||
* @param m repetition mark referring to c
|
||||
* @return normalized character - return c on illegal iteration marks
|
||||
*/
|
||||
private char normalizedHiragana(char c, char m) {
|
||||
switch (m) {
|
||||
case HIRAGANA_ITERATION_MARK:
|
||||
return isHiraganaDakuten(c) ? (char) (c - 1) : c;
|
||||
case HIRAGANA_VOICED_ITERATION_MARK:
|
||||
return lookupHiraganaDakuten(c);
|
||||
default:
|
||||
return c;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize katakana character
|
||||
*
|
||||
* @param c katakana character
|
||||
* @param m repetition mark referring to c
|
||||
* @return normalized character - return c on illegal iteration marks
|
||||
*/
|
||||
private char normalizedKatakana(char c, char m) {
|
||||
switch (m) {
|
||||
case KATAKANA_ITERATION_MARK:
|
||||
return isKatakanaDakuten(c) ? (char) (c - 1) : c;
|
||||
case KATAKANA_VOICED_ITERATION_MARK:
|
||||
return lookupKatakanaDakuten(c);
|
||||
default:
|
||||
return c;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Iteration mark character predicate
|
||||
*
|
||||
* @param c character to test
|
||||
* @return true if c is an iteration mark character. Otherwise false.
|
||||
*/
|
||||
private boolean isIterationMark(char c) {
|
||||
return isKanjiIterationMark(c) || isHiraganaIterationMark(c) || isKatakanaIterationMark(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hiragana iteration mark character predicate
|
||||
*
|
||||
* @param c character to test
|
||||
* @return true if c is a hiragana iteration mark character. Otherwise false.
|
||||
*/
|
||||
private boolean isHiraganaIterationMark(char c) {
|
||||
if (normalizeKana) {
|
||||
return c == HIRAGANA_ITERATION_MARK || c == HIRAGANA_VOICED_ITERATION_MARK;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Katakana iteration mark character predicate
|
||||
*
|
||||
* @param c character to test
|
||||
* @return true if c is a katakana iteration mark character. Otherwise false.
|
||||
*/
|
||||
private boolean isKatakanaIterationMark(char c) {
|
||||
if (normalizeKana) {
|
||||
return c == KATAKANA_ITERATION_MARK || c == KATAKANA_VOICED_ITERATION_MARK;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Kanji iteration mark character predicate
|
||||
*
|
||||
* @param c character to test
|
||||
* @return true if c is a kanji iteration mark character. Otherwise false.
|
||||
*/
|
||||
private boolean isKanjiIterationMark(char c) {
|
||||
if (normalizeKanji) {
|
||||
return c == KANJI_ITERATION_MARK;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Look up hiragana dakuten
|
||||
*
|
||||
* @param c character to look up
|
||||
* @return hiragana dakuten variant of c or c itself if no dakuten variant exists
|
||||
*/
|
||||
private char lookupHiraganaDakuten(char c) {
|
||||
return lookup(c, h2d, '\u304b'); // Code point is for か
|
||||
}
|
||||
|
||||
/**
|
||||
* Look up katakana dakuten. Only full-width katakana are supported.
|
||||
*
|
||||
* @param c character to look up
|
||||
* @return katakana dakuten variant of c or c itself if no dakuten variant exists
|
||||
*/
|
||||
private char lookupKatakanaDakuten(char c) {
|
||||
return lookup(c, k2d, '\u30ab'); // Code point is for カ
|
||||
}
|
||||
|
||||
/**
|
||||
* Hiragana dakuten predicate
|
||||
*
|
||||
* @param c character to check
|
||||
* @return true if c is a hiragana dakuten and otherwise false
|
||||
*/
|
||||
private boolean isHiraganaDakuten(char c) {
|
||||
return inside(c, h2d, '\u304b') && c == lookupHiraganaDakuten(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Katakana dakuten predicate
|
||||
*
|
||||
* @param c character to check
|
||||
* @return true if c is a hiragana dakuten and otherwise false
|
||||
*/
|
||||
private boolean isKatakanaDakuten(char c) {
|
||||
return inside(c, k2d, '\u30ab') && c == lookupKatakanaDakuten(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up a character in dakuten map and returns the dakuten variant if it exists.
|
||||
* Otherwise return the character being looked up itself
|
||||
*
|
||||
* @param c character to look up
|
||||
* @param map dakuten map
|
||||
* @param offset code point offset from c
|
||||
* @return mapped character or c if no mapping exists
|
||||
*/
|
||||
private char lookup(char c, char[] map, char offset) {
|
||||
if (!inside(c, map, offset)) {
|
||||
return c;
|
||||
} else {
|
||||
return map[c - offset];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Predicate indicating if the lookup character is within dakuten map range
|
||||
*
|
||||
* @param c character to look up
|
||||
* @param map dakuten map
|
||||
* @param offset code point offset from c
|
||||
* @return true if c is mapped by map and otherwise false
|
||||
*/
|
||||
private boolean inside(char c, char[] map, char offset) {
|
||||
return c >= offset && c < offset + map.length;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,222 @@
|
|||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.charfilter.CharFilter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class TestJapaneseIterationMarkCharFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
private Analyzer keywordAnalyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
return new JapaneseIterationMarkCharFilter(CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
|
||||
private Analyzer japaneseAnalyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new JapaneseTokenizer(reader, null, false, JapaneseTokenizer.Mode.SEARCH);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
return new JapaneseIterationMarkCharFilter(CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
|
||||
public void testKanji() throws IOException {
|
||||
// Test single repetition
|
||||
assertAnalyzesTo(keywordAnalyzer, "時々", new String[]{"時時"});
|
||||
assertAnalyzesTo(japaneseAnalyzer, "時々", new String[]{"時時"});
|
||||
|
||||
// Test multiple repetitions
|
||||
assertAnalyzesTo(keywordAnalyzer, "馬鹿々々しい", new String[]{"馬鹿馬鹿しい"});
|
||||
assertAnalyzesTo(japaneseAnalyzer, "馬鹿々々しい", new String[]{"馬鹿馬鹿しい"});
|
||||
}
|
||||
|
||||
public void testKatakana() throws IOException {
|
||||
// Test single repetition
|
||||
assertAnalyzesTo(keywordAnalyzer, "ミスヾ", new String[]{"ミスズ"});
|
||||
assertAnalyzesTo(japaneseAnalyzer, "ミスヾ", new String[]{"ミ", "スズ"}); // Side effect
|
||||
}
|
||||
|
||||
public void testHiragana() throws IOException {
|
||||
// Test single unvoiced iteration
|
||||
assertAnalyzesTo(keywordAnalyzer, "おゝの", new String[]{"おおの"});
|
||||
assertAnalyzesTo(japaneseAnalyzer, "おゝの", new String[]{"お", "おの"}); // Side effect
|
||||
|
||||
// Test single voiced iteration
|
||||
assertAnalyzesTo(keywordAnalyzer, "みすゞ", new String[]{"みすず"});
|
||||
assertAnalyzesTo(japaneseAnalyzer, "みすゞ", new String[]{"みすず"});
|
||||
|
||||
// Test single voiced iteration
|
||||
assertAnalyzesTo(keywordAnalyzer, "じゞ", new String[]{"じじ"});
|
||||
assertAnalyzesTo(japaneseAnalyzer, "じゞ", new String[]{"じじ"});
|
||||
|
||||
// Test single unvoiced iteration with voiced iteration
|
||||
assertAnalyzesTo(keywordAnalyzer, "じゝ", new String[]{"じし"});
|
||||
assertAnalyzesTo(japaneseAnalyzer, "じゝ", new String[]{"じし"});
|
||||
|
||||
// Test multiple repetitions with voiced iteration
|
||||
assertAnalyzesTo(keywordAnalyzer, "ところゞゝゝ", new String[]{"ところどころ"});
|
||||
assertAnalyzesTo(japaneseAnalyzer, "ところゞゝゝ", new String[]{"ところどころ"});
|
||||
}
|
||||
|
||||
public void testMalformed() throws IOException {
|
||||
// We can't iterate c here, so emit as it is
|
||||
assertAnalyzesTo(keywordAnalyzer, "abcところゝゝゝゝ", new String[]{"abcところcところ"});
|
||||
|
||||
// We can't iterate c (with dakuten change) here, so emit it as-is
|
||||
assertAnalyzesTo(keywordAnalyzer, "abcところゞゝゝゝ", new String[]{"abcところcところ"});
|
||||
|
||||
// We can't iterate before beginning of stream, so emit characters as-is
|
||||
assertAnalyzesTo(keywordAnalyzer, "ところゞゝゝゞゝゞ", new String[]{"ところどころゞゝゞ"});
|
||||
|
||||
// We can't iterate an iteration mark only, so emit as-is
|
||||
assertAnalyzesTo(keywordAnalyzer, "々", new String[]{"々"});
|
||||
assertAnalyzesTo(keywordAnalyzer, "ゞ", new String[]{"ゞ"});
|
||||
assertAnalyzesTo(keywordAnalyzer, "ゞゝ", new String[]{"ゞゝ"});
|
||||
|
||||
// We can't iterate a full stop punctuation mark (because we use it as a flush marker)
|
||||
assertAnalyzesTo(keywordAnalyzer, "。ゝ", new String[]{"。ゝ"});
|
||||
assertAnalyzesTo(keywordAnalyzer, "。。ゝゝ", new String[]{"。。ゝゝ"});
|
||||
|
||||
// We can iterate other punctuation marks
|
||||
assertAnalyzesTo(keywordAnalyzer, "?ゝ", new String[]{"??"});
|
||||
|
||||
// We can not get a dakuten variant of ぽ -- this is also a corner case test for inside()
|
||||
assertAnalyzesTo(keywordAnalyzer, "ねやぽゞつむぴ", new String[]{"ねやぽぽつむぴ"});
|
||||
assertAnalyzesTo(keywordAnalyzer, "ねやぽゝつむぴ", new String[]{"ねやぽぽつむぴ"});
|
||||
}
|
||||
|
||||
public void testEmpty() throws IOException {
|
||||
// Empty input stays empty
|
||||
assertAnalyzesTo(keywordAnalyzer, "", new String[0]);
|
||||
assertAnalyzesTo(japaneseAnalyzer, "", new String[0]);
|
||||
}
|
||||
|
||||
public void testFullStop() throws IOException {
|
||||
// Test full stops
|
||||
assertAnalyzesTo(keywordAnalyzer, "。", new String[]{"。"});
|
||||
assertAnalyzesTo(keywordAnalyzer, "。。", new String[]{"。。"});
|
||||
assertAnalyzesTo(keywordAnalyzer, "。。。", new String[]{"。。。"});
|
||||
}
|
||||
|
||||
public void testKanjiOnly() throws IOException {
|
||||
// Test kanji only repetition marks
|
||||
CharFilter filter = new JapaneseIterationMarkCharFilter(
|
||||
CharReader.get(new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。")),
|
||||
true, // kanji
|
||||
false // no kana
|
||||
);
|
||||
assertCharFilterEquals(filter, "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
|
||||
}
|
||||
|
||||
public void testKanaOnly() throws IOException {
|
||||
// Test kana only repetition marks
|
||||
CharFilter filter = new JapaneseIterationMarkCharFilter(
|
||||
CharReader.get(new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。")),
|
||||
false, // no kanji
|
||||
true // kana
|
||||
);
|
||||
assertCharFilterEquals(filter, "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。");
|
||||
}
|
||||
|
||||
public void testNone() throws IOException {
|
||||
// Test no repetition marks
|
||||
CharFilter filter = new JapaneseIterationMarkCharFilter(
|
||||
CharReader.get(new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。")),
|
||||
false, // no kanji
|
||||
false // no kana
|
||||
);
|
||||
assertCharFilterEquals(filter, "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
|
||||
}
|
||||
|
||||
public void testCombinations() throws IOException {
|
||||
assertAnalyzesTo(keywordAnalyzer, "時々、おゝのさんと一緒にお寿司を食べに行きます。",
|
||||
new String[]{"時時、おおのさんと一緒にお寿司を食べに行きます。"}
|
||||
);
|
||||
}
|
||||
|
||||
public void testHiraganaCoverage() throws IOException {
|
||||
// Test all hiragana iteration variants
|
||||
String source = "かゝがゝきゝぎゝくゝぐゝけゝげゝこゝごゝさゝざゝしゝじゝすゝずゝせゝぜゝそゝぞゝたゝだゝちゝぢゝつゝづゝてゝでゝとゝどゝはゝばゝひゝびゝふゝぶゝへゝべゝほゝぼゝ";
|
||||
String target = "かかがかききぎきくくぐくけけげけここごこささざさししじしすすずすせせぜせそそぞそたただたちちぢちつつづつててでてととどとははばはひひびひふふぶふへへべへほほぼほ";
|
||||
assertAnalyzesTo(keywordAnalyzer, source, new String[]{target});
|
||||
|
||||
// Test all hiragana iteration variants with dakuten
|
||||
source = "かゞがゞきゞぎゞくゞぐゞけゞげゞこゞごゞさゞざゞしゞじゞすゞずゞせゞぜゞそゞぞゞたゞだゞちゞぢゞつゞづゞてゞでゞとゞどゞはゞばゞひゞびゞふゞぶゞへゞべゞほゞぼゞ";
|
||||
target = "かがががきぎぎぎくぐぐぐけげげげこごごごさざざざしじじじすずずずせぜぜぜそぞぞぞただだだちぢぢぢつづづづてでででとどどどはばばばひびびびふぶぶぶへべべべほぼぼぼ";
|
||||
assertAnalyzesTo(keywordAnalyzer, source, new String[]{target});
|
||||
}
|
||||
|
||||
public void testKatakanaCoverage() throws IOException {
|
||||
// Test all katakana iteration variants
|
||||
String source = "カヽガヽキヽギヽクヽグヽケヽゲヽコヽゴヽサヽザヽシヽジヽスヽズヽセヽゼヽソヽゾヽタヽダヽチヽヂヽツヽヅヽテヽデヽトヽドヽハヽバヽヒヽビヽフヽブヽヘヽベヽホヽボヽ";
|
||||
String target = "カカガカキキギキククグクケケゲケココゴコササザサシシジシススズスセセゼセソソゾソタタダタチチヂチツツヅツテテデテトトドトハハバハヒヒビヒフフブフヘヘベヘホホボホ";
|
||||
assertAnalyzesTo(keywordAnalyzer, source, new String[]{target});
|
||||
|
||||
// Test all katakana iteration variants with dakuten
|
||||
source = "カヾガヾキヾギヾクヾグヾケヾゲヾコヾゴヾサヾザヾシヾジヾスヾズヾセヾゼヾソヾゾヾタヾダヾチヾヂヾツヾヅヾテヾデヾトヾドヾハヾバヾヒヾビヾフヾブヾヘヾベヾホヾボヾ";
|
||||
target = "カガガガキギギギクグググケゲゲゲコゴゴゴサザザザシジジジスズズズセゼゼゼソゾゾゾタダダダチヂヂヂツヅヅヅテデデデトドドドハバババヒビビビフブブブヘベベベホボボボ";
|
||||
assertAnalyzesTo(keywordAnalyzer, source, new String[]{target});
|
||||
}
|
||||
|
||||
public void testRandomStrings() throws Exception {
|
||||
// Blast some random strings through
|
||||
checkRandomData(random(), keywordAnalyzer, 1000 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
public void testRandomHugeStrings() throws Exception {
|
||||
// Blast some random strings through
|
||||
checkRandomData(random(), keywordAnalyzer, 100 * RANDOM_MULTIPLIER, 8192);
|
||||
}
|
||||
|
||||
private void assertCharFilterEquals(CharFilter filter, String expected) throws IOException {
|
||||
String actual = readFully(filter);
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
|
||||
private String readFully(CharStream stream) throws IOException {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
int ch;
|
||||
while ((ch = stream.read()) != -1) {
|
||||
buffer.append((char) ch);
|
||||
}
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -28,6 +28,9 @@ $Id$
|
|||
|
||||
New Features
|
||||
|
||||
* LUCENE-4201: Added JapaneseIterationMarkCharFilterFactory to normalize Japanese
|
||||
iteration marks. (Robert Muir, Christian Moen)
|
||||
|
||||
* SOLR-1856: In Solr Cell, literals should override Tika-parsed values.
|
||||
Patch adds a param "literalsOverride" which defaults to true, but can be set
|
||||
to "false" to let Tika-parsed values be appended to literal values (Chris Harris, janhoy)
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
|
||||
* <analyzer>
|
||||
* <charFilter class="solr.JapaneseIterationMarkCharFilterFactory normalizeKanji="true" normalizeKana="true"/>
|
||||
* <tokenizer class="solr.JapaneseTokenizerFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
private static final String NORMALIZE_KANJI_PARAM = "normalizeKanji";
|
||||
|
||||
private static final String NORMALIZE_KANA_PARAM = "normalizeKana";
|
||||
|
||||
private boolean normalizeKanji = true;
|
||||
|
||||
private boolean normalizeKana = true;
|
||||
|
||||
@Override
|
||||
public CharStream create(CharStream input) {
|
||||
return new JapaneseIterationMarkCharFilter(input, normalizeKanji, normalizeKana);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
normalizeKanji = getBoolean(NORMALIZE_KANJI_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
|
||||
normalizeKana = getBoolean(NORMALIZE_KANA_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,100 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Simple tests for {@link org.apache.solr.analysis.JapaneseIterationMarkCharFilterFactory}
|
||||
*/
|
||||
public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testIterationMarksWithKeywordTokenizer() throws IOException {
|
||||
final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
|
||||
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
|
||||
CharStream filter = filterFactory.create(CharReader.get(new StringReader(text)));
|
||||
TokenStream tokenStream = new MockTokenizer(filter, MockTokenizer.KEYWORD, false);
|
||||
assertTokenStreamContents(tokenStream, new String[]{"時時馬鹿馬鹿しいところどころミスズ"});
|
||||
}
|
||||
|
||||
public void testIterationMarksWithJapaneseTokenizer() throws IOException {
|
||||
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
|
||||
Map<String, String> tokenizerArgs = Collections.emptyMap();
|
||||
tokenizerFactory.init(tokenizerArgs);
|
||||
tokenizerFactory.inform(new SolrResourceLoader(null, null));
|
||||
|
||||
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
|
||||
Map<String, String> filterArgs = Collections.emptyMap();
|
||||
filterFactory.init(filterArgs);
|
||||
|
||||
CharStream filter = filterFactory.create(
|
||||
CharReader.get(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"))
|
||||
);
|
||||
TokenStream tokenStream = tokenizerFactory.create(filter);
|
||||
assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ"});
|
||||
}
|
||||
|
||||
public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
|
||||
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
|
||||
Map<String, String> tokenizerArgs = Collections.emptyMap();
|
||||
tokenizerFactory.init(tokenizerArgs);
|
||||
tokenizerFactory.inform(new SolrResourceLoader(null, null));
|
||||
|
||||
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
|
||||
Map<String, String> filterArgs = new HashMap<String, String>();
|
||||
filterArgs.put("normalizeKanji", "true");
|
||||
filterArgs.put("normalizeKana", "false");
|
||||
filterFactory.init(filterArgs);
|
||||
|
||||
CharStream filter = filterFactory.create(
|
||||
CharReader.get(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"))
|
||||
);
|
||||
TokenStream tokenStream = tokenizerFactory.create(filter);
|
||||
assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ"});
|
||||
}
|
||||
|
||||
public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
|
||||
JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
|
||||
Map<String, String> tokenizerArgs = Collections.emptyMap();
|
||||
tokenizerFactory.init(tokenizerArgs);
|
||||
tokenizerFactory.inform(new SolrResourceLoader(null, null));
|
||||
|
||||
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
|
||||
Map<String, String> filterArgs = new HashMap<String, String>();
|
||||
filterArgs.put("normalizeKanji", "false");
|
||||
filterArgs.put("normalizeKana", "true");
|
||||
filterFactory.init(filterArgs);
|
||||
|
||||
CharStream filter = filterFactory.create(
|
||||
CharReader.get(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"))
|
||||
);
|
||||
TokenStream tokenStream = tokenizerFactory.create(filter);
|
||||
assertTokenStreamContents(tokenStream, new String[]{"時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ"});
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue