 * <fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
-*     <tokenizer class="solr.ICUTokenizerFactory"
+*     <tokenizer class="solr.ICUTokenizerFactory" cjkAsWords="true"
 *                rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/>
 *   </analyzer>
 * </fieldType>
@@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
   static final String RULEFILES = "rulefiles";
   private final Map<Integer,String> tailored;
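
Outside of Solr, the same cjkAsWords switch is reachable programmatically through the
tokenizer classes this patch touches. A minimal sketch, assuming only the constructors
exercised by the tests later in this diff (ICUTokenizer plus DefaultICUTokenizerConfig's
boolean cjkAsWords argument):

    import java.io.StringReader;
    import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;

    // equivalent of cjkAsWords="true": Han/Hiragana/Katakana runs are segmented
    // with ICU's dictionary-based word breaking instead of emitting unigrams
    ICUTokenizer tokenizer =
        new ICUTokenizer(new StringReader("我购买了道具和服装。"),
                         new DefaultICUTokenizerConfig(true));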
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java
deleted file mode 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java
+++ /dev/null
-/**
- * This breaks Lao text into syllables according to:
- * Syllabification of Lao Script for Line Breaking
- * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
- * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
- * <p>
- * Most work is accomplished with RBBI rules, however some additional special logic is needed
- * that cannot be coded in a grammar, and this is implemented here.
- * <p>
- * For example, what appears to be a final consonant might instead be part of the next syllable.
- * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
- * <p>
- * Take for instance the text ກວ່າດອກ
- * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
- * What LaoBreakIterator does, according to the paper: it backtracks, moving the
- * offending consonant onto the current syllable, then verifies that both the shortened
- * previous syllable and the new current syllable are still legal.
- * <p>
- * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
- * This is the issue of combining marks being in the wrong order (typos).
- *
- * @lucene.experimental
- */
-public class LaoBreakIterator extends BreakIterator {
- RuleBasedBreakIterator rules;
- CharArrayIterator text;
-
- CharArrayIterator working = new CharArrayIterator();
- int workingOffset = 0;
-
- CharArrayIterator verifyText = new CharArrayIterator();
- RuleBasedBreakIterator verify;
-
- private static final UnicodeSet laoSet;
- static {
- laoSet = new UnicodeSet("[:Lao:]");
- laoSet.compact();
- laoSet.freeze();
- }
-
- /**
- * Creates a new iterator, performing the backtracking verification
- * across the provided rules.
- */
- public LaoBreakIterator(RuleBasedBreakIterator rules) {
- this.rules = (RuleBasedBreakIterator) rules.clone();
- this.verify = (RuleBasedBreakIterator) rules.clone();
- }
-
- @Override
- public int current() {
- int current = rules.current();
- return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
- }
-
- @Override
- public int first() {
- working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
- rules.setText(working);
- workingOffset = 0;
- int first = rules.first();
- return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
- }
-
- @Override
- public int following(int offset) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public CharacterIterator getText() {
- return text;
- }
-
- @Override
- public int last() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int next() {
- int current = current();
- int next = rules.next();
- if (next == BreakIterator.DONE)
- return next;
- else
- next += workingOffset;
-
- char c = working.current();
- int following = rules.next(); // lookahead
- if (following != BreakIterator.DONE) {
- following += workingOffset;
- if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
- workingOffset = next - 1;
- working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
- return next - 1;
- }
- rules.previous(); // undo the lookahead
- }
-
- return next;
- }
-
- @Override
- public int next(int n) {
- if (n < 0)
- throw new UnsupportedOperationException("Backwards traversal is unsupported");
-
- int result = current();
- while (n > 0) {
- result = next();
- --n;
- }
- return result;
- }
-
- @Override
- public int previous() {
- throw new UnsupportedOperationException("Backwards traversal is unsupported");
- }
-
- @Override
- public void setText(CharacterIterator text) {
- if (!(text instanceof CharArrayIterator))
- throw new UnsupportedOperationException("unsupported CharacterIterator");
- this.text = (CharArrayIterator) text;
- ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
- working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
- rules.setText(working);
- workingOffset = 0;
- }
-
- @Override
- public void setText(String newText) {
- CharArrayIterator ci = new CharArrayIterator();
- ci.setText(newText.toCharArray(), 0, newText.length());
- setText(ci);
- }
-
- private boolean verifyPushBack(int current, int next) {
- int shortenedSyllable = next - current - 1;
-
- verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
- verify.setText(verifyText);
- if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
- return false;
-
-
- verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
- verify.setText(verifyText);
-
- return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
- }
-
- // TODO: only bubblesort around runs of combining marks, instead of the entire text.
- private void ccReorder(char[] text, int start, int length) {
- boolean reordered;
- do {
- int prevCC = 0;
- reordered = false;
- for (int i = start; i < start + length; i++) {
- final char c = text[i];
- final int cc = UCharacter.getCombiningClass(c);
- if (cc > 0 && cc < prevCC) {
- // swap
- text[i] = text[i - 1];
- text[i - 1] = c;
- reordered = true;
- } else {
- prevCC = cc;
- }
- }
-
- } while (reordered == true);
- }
-
- /**
- * Clone method. Creates another LaoBreakIterator with the same behavior
- * and current state as this one.
- * @return The clone.
- */
- @Override
- public LaoBreakIterator clone() {
- LaoBreakIterator other = (LaoBreakIterator) super.clone();
- other.rules = (RuleBasedBreakIterator) rules.clone();
- other.verify = (RuleBasedBreakIterator) verify.clone();
- if (text != null)
- other.text = text.clone();
- if (working != null)
- other.working = working.clone();
- if (verifyText != null)
- other.verifyText = verifyText.clone();
- return other;
- }
-}
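
With the custom iterator removed (along with Lao.brk further down), Lao segmentation is
expected to come from ICU's stock break rules. A minimal sketch of what the removal relies
on, reusing the constructor from the tests later in this diff; the expected split is the
one testLao asserts:

    import java.io.StringReader;
    import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;

    // the backtracking example from the deleted javadoc, now handled by ICU itself:
    // ກວ່າດອກ still splits into ກວ່າ + ດອກ
    ICUTokenizer tokenizer =
        new ICUTokenizer(new StringReader("ກວ່າດອກ"),
                         new DefaultICUTokenizerConfig(false));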
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
index 779dc9ba404..f573b192bce 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
@@ -59,6 +59,15 @@ final class ScriptIterator {
private int scriptStart;
private int scriptLimit;
private int scriptCode;
+
+ private final boolean combineCJ;
+
+ /**
+ * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
+ */
+ ScriptIterator(boolean combineCJ) {
+ this.combineCJ = combineCJ;
+ }
/**
* Get the start of this script run
@@ -162,10 +171,24 @@ final class ScriptIterator {
}
/** fast version of UScript.getScript(). Basic Latin is an array lookup */
- private static int getScript(int codepoint) {
- if (0 <= codepoint && codepoint < basicLatin.length)
+ private int getScript(int codepoint) {
+ if (0 <= codepoint && codepoint < basicLatin.length) {
return basicLatin[codepoint];
- else
- return UScript.getScript(codepoint);
+ } else {
+ int script = UScript.getScript(codepoint);
+ if (combineCJ) {
+ if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
+ return UScript.JAPANESE;
+ } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
+ // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
+ // they are treated as punctuation. we currently have no cleaner way to fix this!
+ return UScript.LATIN;
+ } else {
+ return script;
+ }
+ } else {
+ return script;
+ }
+ }
}
}
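
Since ScriptIterator is package-private, the new mapping is not directly callable from
user code; the following standalone sketch mirrors the same decision logic against the
public ICU4J API (the method name is illustrative, not part of the patch):

    import com.ibm.icu.lang.UScript;

    // collapse Han/Hiragana/Katakana into a single JAPANESE run, and keep
    // fullwidth digits (U+FF10-U+FF19, script COMMON) out of the CJK
    // dictionary by treating them as Latin
    static int combinedScript(int codepoint) {
      int script = UScript.getScript(codepoint);
      if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
        return UScript.JAPANESE;
      } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
        return UScript.LATIN;
      }
      return script;
    }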
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
index 9e5ac475979..e9d911964da 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
@@ -84,6 +84,10 @@ public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribut
@Override
public void reflectWith(AttributeReflector reflector) {
- reflector.reflect(ScriptAttribute.class, "script", getName());
+ // when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to
+ // mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset),
+ // but this is just to help prevent confusion.
+ String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
+ reflector.reflect(ScriptAttribute.class, "script", name);
}
}
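
From the consumer side, the attribute behaves like any other token attribute; a small
usage sketch (standard Lucene TokenStream consumption; nothing here beyond
ScriptAttribute itself is specific to this patch):

    import java.io.StringReader;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
    import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader("仮名遣い testing"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    ScriptAttribute script = tokenizer.addAttribute(ScriptAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // for the CJK token, getName() reports "Japanese"; only the reflection
      // path above renames it to "Chinese/Japanese"
      System.out.println(term + " => " + script.getName());
    }
    tokenizer.end();
    tokenizer.close();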
diff --git a/lucene/analysis/icu/src/java/overview.html b/lucene/analysis/icu/src/java/overview.html
index a379f55963e..5411a4fcaee 100644
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@@ -14,6 +14,7 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
+
 Collator collator = Collator.getInstance(new ULocale("ar"));
-ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
+ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_50, collator);
 RAMDirectory ramDir = new RAMDirectory();
-IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
+IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_50, analyzer));
 Document doc = new Document();
 doc.add(new Field("content", "\u0633\u0627\u0628", Field.Store.YES, Field.Index.ANALYZED));
@@ -124,7 +125,7 @@ algorithm.
 writer.close();
 IndexSearcher is = new IndexSearcher(ramDir, true);
-QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+QueryParser aqp = new QueryParser(Version.LUCENE_50, "content", analyzer);
 aqp.setAnalyzeRangeTerms(true);
 // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
@@ -140,9 +141,9 @@ algorithm.
 Danish Sorting
 Analyzer analyzer
-  = new ICUCollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new ULocale("da", "dk")));
+  = new ICUCollationKeyAnalyzer(Version.LUCENE_50, Collator.getInstance(new ULocale("da", "dk")));
 RAMDirectory indexStore = new RAMDirectory();
-IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
+IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_50, analyzer));
 String[] tracer = new String[] { "A", "B", "C", "D", "E" };
 String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
 String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -168,15 +169,15 @@ algorithm.
 Collator collator = Collator.getInstance(new ULocale("tr", "TR"));
 collator.setStrength(Collator.PRIMARY);
-Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
+Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_50, collator);
 RAMDirectory ramDir = new RAMDirectory();
-IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
+IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_50, analyzer));
 Document doc = new Document();
 doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
 writer.addDocument(doc);
 writer.close();
 IndexSearcher is = new IndexSearcher(ramDir, true);
-QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
+QueryParser parser = new QueryParser(Version.LUCENE_50, "contents", analyzer);
 Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
 ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
 assertEquals("The index Term should be included.", 1, result.length);
@@ -353,7 +354,7 @@ and
 Backwards Compatibility
 This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 6.1). However, some users who wish
+the most recent version of Unicode (currently 6.3). However, some users who wish
 for stronger backwards compatibility can restrict {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter}
 to operate on only a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
index 3972d1cd7d4..e4b35d24e80 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk differ
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk
deleted file mode 100644
index 5a6666466a7..00000000000
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk and /dev/null differ
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
index f5b50e14e76..dd368d05ec2 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk differ
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk
deleted file mode 100644
index 571b0163441..00000000000
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk and /dev/null differ
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk
index 1bab7a616ef..dcaeb571789 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk differ
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
index 6e85a18dbf1..efbbb9e490c 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm differ
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
index a7c02688b54..1d9a901fb1e 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
     sb.append(whitespace);
     sb.append("testing 1234");
     String input = sb.toString();
-    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
     assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
   }
@@ -52,7 +52,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
       sb.append('a');
     }
     String input = sb.toString();
-    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
     char token[] = new char[4096];
     Arrays.fill(token, 'a');
     String expectedToken = new String(token);
@@ -69,7 +69,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new ICUTokenizer(reader);
+      Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
       TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
       return new TokenStreamComponents(tokenizer, filter);
     }
@@ -118,6 +118,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   public void testLao() throws Exception {
     assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
+    assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
   }
 
   public void testThai() throws Exception {
@@ -138,6 +139,13 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
       new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
   }
 
+  public void testHebrew() throws Exception {
+    assertAnalyzesTo(a, "דנקנר תקף את הדו\"ח",
+        new String[] { "דנקנר", "תקף", "את", "הדו\"ח" });
+    assertAnalyzesTo(a, "חברת בת של מודי'ס",
+        new String[] { "חברת", "בת", "של", "מודי'ס" });
+  }
+
   public void testEmpty() throws Exception {
     assertAnalyzesTo(a, "", new String[] {});
     assertAnalyzesTo(a, ".", new String[] {});
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
new file mode 100644
index 00000000000..2e60717d064
--- /dev/null
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
@@ -0,0 +1,91 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.util.Random;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+/**
+ * test ICUTokenizer with dictionary-based CJ segmentation
+ */
+public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
+  Analyzer a = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      return new TokenStreamComponents(new ICUTokenizer(reader));
+    }
+  };
+
+  /**
+   * test stolen from smartcn
+   */
+  public void testSimpleChinese() throws Exception {
+    assertAnalyzesTo(a, "我购买了道具和服装。",
+        new String[] { "我", "购买", "了", "道具", "和", "服装" }
+    );
+  }
+
+  public void testChineseNumerics() throws Exception {
+    assertAnalyzesTo(a, "９４８３", new String[] { "９４８３" });
+    assertAnalyzesTo(a, "院內分機９４８３。",
+        new String[] { "院", "內", "分機", "９４８３" });
+    assertAnalyzesTo(a, "院內分機9483。",
+        new String[] { "院", "內", "分機", "9483" });
+  }
+
+  /**
+   * test stolen from kuromoji
+   */
+  public void testSimpleJapanese() throws Exception {
+    assertAnalyzesTo(a, "それはまだ実験段階にあります",
+        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
+    );
+  }
+
+  public void testJapaneseTypes() throws Exception {
+    assertAnalyzesTo(a, "仮名遣い カタカナ",
+        new String[] { "仮名遣い", "カタカナ" },
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+  }
+
+  public void testKorean() throws Exception {
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+  }
+
+  /** make sure that we still tag korean as HANGUL (for further decomposition/ngram/whatever) */
+  public void testKoreanTypes() throws Exception {
+    assertAnalyzesTo(a, "훈민정음",
+        new String[] { "훈민정음" },
+        new String[] { "