LUCENE-3063: factor CharTokenizer/CharacterUtils into analyzers module

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1098871 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-05-03 00:29:47 +00:00
parent e5256e71e2
commit 4455345c6e
20 changed files with 95 additions and 36 deletions

View File

@@ -312,6 +312,8 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
 - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
 - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
 - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
+- o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer
+- o.a.l.util.CharacterUtils -> o.a.l.analysis.util.CharacterUtils
 * LUCENE-2514: The option to use a Collator's order (instead of binary order) for
   sorting and range queries has been moved to contrib/queries.
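
For code outside the module, these moves are source-compatible after an import change only; a minimal before/after sketch (class names taken from the migration entries above):

// Before (Lucene core):
//   import org.apache.lucene.analysis.CharTokenizer;
//   import org.apache.lucene.util.CharacterUtils;
// After (analyzers module):
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.CharacterUtils;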

View File

@@ -20,14 +20,15 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 import java.io.Reader;
 
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 
 /**
  * Automaton-based tokenizer for testing. Optionally lowercases.
  */
-public class MockTokenizer extends CharTokenizer {
+public class MockTokenizer extends Tokenizer {
   /** Acts Similar to WhitespaceTokenizer */
   public static final CharacterRunAutomaton WHITESPACE =
     new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton());
@@ -45,21 +46,67 @@ public class MockTokenizer extends CharTokenizer {
   private final boolean lowerCase;
   private int state;
 
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  int off = 0;
+
   public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
-    super(LuceneTestCase.TEST_VERSION_CURRENT, factory, input);
+    super(factory, input);
     this.runAutomaton = runAutomaton;
     this.lowerCase = lowerCase;
     this.state = runAutomaton.getInitialState();
   }
 
   public MockTokenizer(Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
-    super(LuceneTestCase.TEST_VERSION_CURRENT, input);
+    super(input);
     this.runAutomaton = runAutomaton;
     this.lowerCase = lowerCase;
     this.state = runAutomaton.getInitialState();
   }
 
+  @Override
+  public final boolean incrementToken() throws IOException {
+    clearAttributes();
+    for (;;) {
+      int startOffset = off;
+      int cp = readCodePoint();
+      if (cp < 0) {
+        break;
+      } else if (isTokenChar(cp)) {
+        int endOffset;
+        do {
+          char chars[] = Character.toChars(normalize(cp));
+          for (int i = 0; i < chars.length; i++)
+            termAtt.append(chars[i]);
+          endOffset = off;
+          cp = readCodePoint();
+        } while (cp >= 0 && isTokenChar(cp));
+        offsetAtt.setOffset(startOffset, endOffset);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  protected int readCodePoint() throws IOException {
+    int ch = input.read();
+    if (ch < 0) {
+      return ch;
+    } else {
+      assert !Character.isLowSurrogate((char) ch);
+      off++;
+      if (Character.isHighSurrogate((char) ch)) {
+        int ch2 = input.read();
+        if (ch2 >= 0) {
+          off++;
+          assert Character.isLowSurrogate((char) ch2);
+          return Character.toCodePoint((char) ch, (char) ch2);
+        }
+      }
+      return ch;
+    }
+  }
+
   protected boolean isTokenChar(int c) {
     state = runAutomaton.step(state, c);
     if (state < 0) {
@@ -70,7 +117,6 @@ public class MockTokenizer extends CharTokenizer {
     }
   }
 
-  @Override
   protected int normalize(int c) {
     return lowerCase ? Character.toLowerCase(c) : c;
   }
@@ -79,5 +125,12 @@ public class MockTokenizer extends CharTokenizer {
   public void reset() throws IOException {
     super.reset();
     state = runAutomaton.getInitialState();
+    off = 0;
   }
+
+  @Override
+  public void end() throws IOException {
+    int finalOffset = correctOffset(off);
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
 }
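
A minimal usage sketch (not part of the commit): MockTokenizer now drives its own incrementToken() loop over code points instead of inheriting CharTokenizer's, but a test exercises it the same way. The input string here is illustrative.

// Sketch of exercising the reworked tokenizer (names from the diff above).
void demo() throws IOException {
  MockTokenizer ts = new MockTokenizer(new StringReader("Two Tokens"),
                                       MockTokenizer.WHITESPACE, true); // lowerCase = true
  CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // prints "two", then "tokens"
  }
  ts.end();   // final offset comes from the code-point counter 'off'
  ts.close();
}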

View File

@@ -1603,7 +1603,7 @@ public class TestIndexWriter extends LuceneTestCase {
 
   // LUCENE-510
   public void testInvalidUTF16() throws Throwable {
     Directory dir = newDirectory();
-    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)));
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new StringSplitAnalyzer()));
     Document doc = new Document();
     final int count = utf8Data.length/2;

View File

@@ -616,7 +616,7 @@ public class TestStressIndexing2 extends LuceneTestCase {
     }
 
     for(int i=start;i<end;i++) {
-      int t = nextInt(6);
+      int t = nextInt(5);
       if (0 == t && i < end-1) {
         // Make a surrogate pair
        // High surrogate
@@ -631,13 +631,6 @@ public class TestStressIndexing2 extends LuceneTestCase {
         buffer[i] = (char) nextInt(0x800, 0xd800);
       else if (4 == t)
         buffer[i] = (char) nextInt(0xe000, 0xffff);
-      else if (5 == t) {
-        // Illegal unpaired surrogate
-        if (r.nextBoolean())
-          buffer[i] = (char) nextInt(0xd800, 0xdc00);
-        else
-          buffer[i] = (char) nextInt(0xdc00, 0xe000);
-      }
     }
     buffer[end] = ' ';
     return 1+end;
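
The deleted case 5 was the only branch that produced lone surrogates, so the generator now emits only well-formed UTF-16. A hedged helper (not part of the commit) expressing the invariant the change restores:

// Every high surrogate must be followed by a low one, and no low
// surrogate may appear on its own.
static boolean isWellFormedUTF16(char[] buffer, int start, int end) {
  for (int i = start; i < end; i++) {
    if (Character.isHighSurrogate(buffer[i])) {
      if (i + 1 >= end || !Character.isLowSurrogate(buffer[i + 1]))
        return false;
      i++; // consume the low half of the pair
    } else if (Character.isLowSurrogate(buffer[i])) {
      return false; // unpaired low surrogate
    }
  }
  return true;
}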

View File

@@ -83,6 +83,8 @@ New Features
 - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
 - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
 - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
+- o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer
+- o.a.l.util.CharacterUtils -> o.a.l.analysis.util.CharacterUtils
 * SOLR-1057: Add PathHierarchyTokenizer that represents file path hierarchies as synonyms of
   /something, /something/something, /something/something/else. (Ryan McKinley, Koji Sekiguchi)
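
A hedged sketch of the behavior the SOLR-1057 entry describes (tokenizer package and Reader-only constructor assumed, not shown in this commit):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer; // package assumed

Tokenizer t = new PathHierarchyTokenizer(new StringReader("/something/something/else"));
// expected tokens: "/something", "/something/something", "/something/something/else"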

View File

@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.ar;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.core.LetterTokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;

View File

@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.core;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;

View File

@@ -22,7 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;
 
 /**

View File

@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.core;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;

View File

@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.core;
 
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import org.apache.lucene.util.Version;

View File

@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.core;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import org.apache.lucene.util.Version;

View File

@@ -19,8 +19,8 @@ package org.apache.lucene.analysis.core;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;

View File

@@ -21,7 +21,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;
 
 /**

View File

@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.in;
 
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharTokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;

View File

@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.ru;
  */
 
 import java.io.Reader;
-import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer; // for javadocs
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
 import org.apache.lucene.util.AttributeSource;

View File

@@ -24,7 +24,7 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 
-import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;

View File

@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis;
+package org.apache.lucene.analysis.util;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -20,12 +20,13 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.io.Reader;
 
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.Version;
-import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
 
 /**
  * An abstract base class for simple, character-oriented tokenizers.
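
For subclasses, only the import changes; the protected API is unchanged. A minimal sketch (hypothetical class, not from the commit) of extending the relocated CharTokenizer under the Version-based constructors of this era:

import java.io.Reader;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical subclass for illustration: emits maximal runs of digits.
public final class DigitTokenizer extends CharTokenizer {
  public DigitTokenizer(Version matchVersion, Reader in) {
    super(matchVersion, in);
  }

  @Override
  protected boolean isTokenChar(int c) {
    return Character.isDigit(c); // c is a full code point, not a char
  }
}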

View File

@@ -1,8 +1,10 @@
-package org.apache.lucene.util;
+package org.apache.lucene.analysis.util;
 
 import java.io.IOException;
 import java.io.Reader;
 
+import org.apache.lucene.util.Version;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
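
A hedged sketch (not from the commit) of the relocated CharacterUtils API as it stood at the time; getInstance(Version) selects code-point semantics that are surrogate-aware for matchVersion >= 3.1:

import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

CharacterUtils utils = CharacterUtils.getInstance(Version.LUCENE_31);
// Reads the full supplementary code point across the surrogate pair:
int cp = utils.codePointAt("\ud801\udc1c", 0); // 0x1041C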

View File

@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis;
+package org.apache.lucene.analysis.util;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -20,6 +20,10 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+
 /**
  * Testcase for {@link CharTokenizer} subclasses
@@ -42,7 +46,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
     }
     // internal buffer size is 1024 make sure we have a surrogate pair right at the border
     builder.insert(1023, "\ud801\udc1c");
-    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true);
+    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
     assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" "));
   }
@@ -59,7 +63,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
       builder.append("a");
     }
     builder.append("\ud801\udc1cabc");
-    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true);
+    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
     assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()});
   }
 }
@@ -73,7 +77,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
     for (int i = 0; i < 255; i++) {
       builder.append("A");
     }
-    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true);
+    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
     assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
   }
@@ -87,7 +91,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
       builder.append("A");
     }
     builder.append("\ud801\udc1c");
-    MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true);
+    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
     assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
   }
 }

View File

@@ -1,4 +1,4 @@
-package org.apache.lucene.util;
+package org.apache.lucene.analysis.util;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,7 +21,9 @@ import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 
-import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.Version;
 import org.junit.Test;
 
 /**