mirror of https://github.com/apache/lucene.git
LUCENE-3063: factor CharTokenizer/CharacterUtils into analyzers module
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1098871 13f79535-47bb-0310-9956-ffa450edef68
parent e5256e71e2
commit 4455345c6e
@@ -312,6 +312,8 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
   - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
   - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
   - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
+  - o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer
+  - o.a.l.util.CharacterUtils -> o.a.l.analysis.util.CharacterUtils
 
 * LUCENE-2514: The option to use a Collator's order (instead of binary order) for
   sorting and range queries has been moved to contrib/queries.
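For downstream code, the moves above are pure package renames, so upgrading is an import-level change. A minimal sketch of a hypothetical downstream subclass after this commit, assuming the relocated CharTokenizer keeps its (Version, Reader) constructor and isTokenChar(int) hook, as the hunks below suggest:

import java.io.Reader;

// was: import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;

/** Hypothetical tokenizer: keeps runs of letters and digits. */
public final class AlphanumTokenizer extends CharTokenizer {
  public AlphanumTokenizer(Version matchVersion, Reader input) {
    super(matchVersion, input);
  }

  @Override
  protected boolean isTokenChar(int c) {
    // CharTokenizer hands its subclasses full code points, not UTF-16 chars.
    return Character.isLetterOrDigit(c);
  }
}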
@@ -20,14 +20,15 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.io.Reader;
 
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 
 /**
  * Automaton-based tokenizer for testing. Optionally lowercases.
  */
-public class MockTokenizer extends CharTokenizer {
+public class MockTokenizer extends Tokenizer {
   /** Acts Similar to WhitespaceTokenizer */
   public static final CharacterRunAutomaton WHITESPACE =
     new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton());
@@ -45,21 +46,67 @@ public class MockTokenizer extends CharTokenizer {
   private final boolean lowerCase;
   private int state;
 
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  int off = 0;
+
   public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
-    super(LuceneTestCase.TEST_VERSION_CURRENT, factory, input);
+    super(factory, input);
     this.runAutomaton = runAutomaton;
     this.lowerCase = lowerCase;
     this.state = runAutomaton.getInitialState();
   }
 
   public MockTokenizer(Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
-    super(LuceneTestCase.TEST_VERSION_CURRENT, input);
+    super(input);
     this.runAutomaton = runAutomaton;
     this.lowerCase = lowerCase;
     this.state = runAutomaton.getInitialState();
   }
 
   @Override
+  public final boolean incrementToken() throws IOException {
+    clearAttributes();
+    for (;;) {
+      int startOffset = off;
+      int cp = readCodePoint();
+      if (cp < 0) {
+        break;
+      } else if (isTokenChar(cp)) {
+        int endOffset;
+        do {
+          char chars[] = Character.toChars(normalize(cp));
+          for (int i = 0; i < chars.length; i++)
+            termAtt.append(chars[i]);
+          endOffset = off;
+          cp = readCodePoint();
+        } while (cp >= 0 && isTokenChar(cp));
+        offsetAtt.setOffset(startOffset, endOffset);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  protected int readCodePoint() throws IOException {
+    int ch = input.read();
+    if (ch < 0) {
+      return ch;
+    } else {
+      assert !Character.isLowSurrogate((char) ch);
+      off++;
+      if (Character.isHighSurrogate((char) ch)) {
+        int ch2 = input.read();
+        if (ch2 >= 0) {
+          off++;
+          assert Character.isLowSurrogate((char) ch2);
+          return Character.toCodePoint((char) ch, (char) ch2);
+        }
+      }
+      return ch;
+    }
+  }
+
   protected boolean isTokenChar(int c) {
     state = runAutomaton.step(state, c);
     if (state < 0) {
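The readCodePoint helper above is the heart of the rewrite: since MockTokenizer no longer inherits CharTokenizer's surrogate-aware buffering, it reads one char at a time and folds each high/low surrogate pair into a single code point before the automaton sees it. A standalone sketch of the same technique (class and method names here are illustrative, not part of the commit):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

public class CodePointReaderDemo {
  /** Returns the next full code point from the reader, or -1 at end of input. */
  static int nextCodePoint(Reader in) throws IOException {
    int ch = in.read();
    if (ch < 0) {
      return -1;
    }
    if (Character.isHighSurrogate((char) ch)) {
      int ch2 = in.read();
      if (ch2 >= 0) {
        assert Character.isLowSurrogate((char) ch2) : "unpaired surrogate";
        // Fold the surrogate pair into one supplementary code point.
        return Character.toCodePoint((char) ch, (char) ch2);
      }
    }
    return ch;
  }

  public static void main(String[] args) throws IOException {
    // "\ud801\udc1c" is one supplementary code point (U+1041C) in two chars.
    Reader in = new StringReader("a\ud801\udc1cb");
    for (int cp = nextCodePoint(in); cp >= 0; cp = nextCodePoint(in)) {
      System.out.printf("U+%04X%n", cp); // U+0061, U+1041C, U+0062
    }
  }
}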
@@ -70,7 +117,6 @@ public class MockTokenizer extends CharTokenizer {
     }
   }
 
-  @Override
   protected int normalize(int c) {
     return lowerCase ? Character.toLowerCase(c) : c;
   }
@@ -79,5 +125,12 @@ public class MockTokenizer extends CharTokenizer {
   public void reset() throws IOException {
     super.reset();
     state = runAutomaton.getInitialState();
+    off = 0;
   }
+
+  @Override
+  public void end() throws IOException {
+    int finalOffset = correctOffset(off);
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
 }
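With reset() extended and end() added, MockTokenizer now implements the full TokenStream consumer contract on its own. A hedged sketch of how a caller might drive it (the printing loop is illustrative; real tests go through the BaseTokenStreamTestCase helpers):

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class MockTokenizerDemo {
  public static void main(String[] args) throws Exception {
    // WHITESPACE automaton + lowerCase=true acts like a lowercasing whitespace tokenizer.
    MockTokenizer ts =
        new MockTokenizer(new StringReader("foo BAR"), MockTokenizer.WHITESPACE, true);
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // prints: foo [0,3)  then  bar [4,7)
      System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
    }
    ts.end();   // final offset = total chars consumed, via the end() added above
    ts.close();
  }
}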
@@ -1603,7 +1603,7 @@ public class TestIndexWriter extends LuceneTestCase {
   // LUCENE-510
   public void testInvalidUTF16() throws Throwable {
     Directory dir = newDirectory();
-    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)));
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new StringSplitAnalyzer()));
     Document doc = new Document();
 
     final int count = utf8Data.length/2;
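The analyzer swap is a consequence of the MockTokenizer rewrite: this test intentionally indexes broken UTF-16, and the new readCodePoint asserts it never sees an unpaired surrogate, so the test switches to StringSplitAnalyzer, which (as the name suggests) splits the raw string without validating surrogates. A minimal illustration of what invalid UTF-16 means here (illustrative code, not part of the commit):

import java.nio.charset.StandardCharsets;

public class InvalidUtf16Demo {
  public static void main(String[] args) {
    // A Java String can hold an unpaired surrogate even though that is not
    // valid UTF-16 text; MockTokenizer's readCodePoint would trip an
    // assertion on input like this.
    String broken = "abc\ud800def"; // lone high surrogate at index 3
    System.out.println(Character.isHighSurrogate(broken.charAt(3))); // true
    // The UTF-8 encoder cannot represent it and substitutes '?':
    byte[] utf8 = broken.getBytes(StandardCharsets.UTF_8);
    System.out.println(new String(utf8, StandardCharsets.UTF_8)); // abc?def
  }
}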
@@ -616,7 +616,7 @@ public class TestStressIndexing2 extends LuceneTestCase {
     }
 
     for(int i=start;i<end;i++) {
-      int t = nextInt(6);
+      int t = nextInt(5);
       if (0 == t && i < end-1) {
         // Make a surrogate pair
         // High surrogate
@@ -631,13 +631,6 @@ public class TestStressIndexing2 extends LuceneTestCase {
         buffer[i] = (char) nextInt(0x800, 0xd800);
       else if (4 == t)
         buffer[i] = (char) nextInt(0xe000, 0xffff);
-      else if (5 == t) {
-        // Illegal unpaired surrogate
-        if (r.nextBoolean())
-          buffer[i] = (char) nextInt(0xd800, 0xdc00);
-        else
-          buffer[i] = (char) nextInt(0xdc00, 0xe000);
-      }
     }
     buffer[end] = ' ';
     return 1+end;
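Shrinking nextInt(6) to nextInt(5) and deleting the t == 5 branch means the generator now emits only well-formed UTF-16: BMP characters outside the surrogate block, or properly ordered high/low pairs. A hedged sketch of the same invariant as a standalone helper (names and the branch layout are illustrative):

import java.util.Random;

public class ValidUtf16Gen {
  /** Fills buffer[start..end) with well-formed UTF-16 and returns end. */
  static int fill(Random r, char[] buffer, int start, int end) {
    for (int i = start; i < end; i++) {
      if (r.nextInt(5) == 0 && i < end - 1) {
        // Emit a well-formed surrogate pair: high surrogate, then low.
        buffer[i] = (char) (0xd800 + r.nextInt(0x400));
        buffer[++i] = (char) (0xdc00 + r.nextInt(0x400));
      } else {
        // Any BMP char outside the surrogate block U+D800-U+DFFF.
        char c;
        do {
          c = (char) r.nextInt(0x10000);
        } while (c >= 0xd800 && c <= 0xdfff);
        buffer[i] = c;
      }
    }
    return end;
  }

  public static void main(String[] args) {
    char[] buf = new char[32];
    int len = fill(new Random(42), buf, 0, buf.length);
    String s = new String(buf, 0, len);
    // Well-formed UTF-16 round-trips losslessly through UTF-8.
    byte[] utf8 = s.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    System.out.println(s.equals(new String(utf8, java.nio.charset.StandardCharsets.UTF_8))); // true
  }
}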
@@ -83,6 +83,8 @@ New Features
   - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
   - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
   - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
+  - o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer
+  - o.a.l.util.CharacterUtils -> o.a.l.analysis.util.CharacterUtils
 
 * SOLR-1057: Add PathHierarchyTokenizer that represents file path hierarchies as synonyms of
   /something, /something/something, /something/something/else. (Ryan McKinley, Koji Sekiguchi)
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.ar;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.core.LetterTokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.core;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
@@ -22,7 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;
 
 /**
@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.core;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.core;
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import org.apache.lucene.util.Version;
 
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.core;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import org.apache.lucene.util.Version;
 
@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.core;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
@@ -21,7 +21,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;
 
 /**
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.in;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.ru;
  */
 
 import java.io.Reader;
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer; // for javadocs
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
 import org.apache.lucene.util.AttributeSource;
@@ -24,7 +24,7 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 
-import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;
 
 
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis;
+package org.apache.lucene.analysis.util;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -20,12 +20,13 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.io.Reader;
 
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;
-import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
 
 /**
  * An abstract base class for simple, character-oriented tokenizers.
@@ -1,8 +1,10 @@
-package org.apache.lucene.util;
+package org.apache.lucene.analysis.util;
 
 import java.io.IOException;
 import java.io.Reader;
 
+import org.apache.lucene.util.Version;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
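Only CharacterUtils' package changes here; it remains the version-aware helper the analyzers use for code-point access over char buffers and Readers. A hedged usage sketch against the 3.x/4.0-era API, assuming getInstance, newCharacterBuffer, fill, and codePointAt keep the signatures they had before the move (the Version constant is illustrative):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.util.Version;

public class CharacterUtilsDemo {
  public static void main(String[] args) throws IOException {
    CharacterUtils utils = CharacterUtils.getInstance(Version.LUCENE_40);
    Reader in = new StringReader("ab\ud801\udc1c");
    // fill() bulk-reads the Reader and avoids leaving half a surrogate
    // pair at the end of the buffer; CharTokenizer relies on this.
    CharacterBuffer buf = CharacterUtils.newCharacterBuffer(1024);
    while (utils.fill(buf, in)) { // false once the reader is exhausted
      for (int i = buf.getOffset(); i < buf.getLength();) {
        int cp = utils.codePointAt(buf.getBuffer(), i);
        System.out.printf("U+%04X%n", cp); // U+0061, U+0062, U+1041C
        i += Character.charCount(cp);
      }
    }
  }
}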
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis;
+package org.apache.lucene.analysis.util;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -20,6 +20,10 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+
 
 /**
  * Testcase for {@link CharTokenizer} subclasses
@@ -42,7 +46,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
     }
     // internal buffer size is 1024 make sure we have a surrogate pair right at the border
     builder.insert(1023, "\ud801\udc1c");
-    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true);
+    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
     assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" "));
   }
 
@@ -59,7 +63,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
       builder.append("a");
     }
     builder.append("\ud801\udc1cabc");
-    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true);
+    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
     assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()});
   }
 }
@@ -73,7 +77,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
     for (int i = 0; i < 255; i++) {
       builder.append("A");
     }
-    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true);
+    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
     assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
   }
 
@@ -87,7 +91,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
       builder.append("A");
     }
     builder.append("\ud801\udc1c");
-    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true);
+    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
     assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
   }
 }
@@ -1,4 +1,4 @@
-package org.apache.lucene.util;
+package org.apache.lucene.analysis.util;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,7 +21,9 @@ import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 
-import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.Version;
 import org.junit.Test;
 
 /**