LUCENE-2413: clean up/doc MockAnalyzer, add a MockTokenFilter, which can simulate stopword/lengthfilter/keepfilter, etc.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@944908 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-16 20:56:58 +00:00
parent d23eb64bd7
commit e292af7b12
20 changed files with 249 additions and 66 deletions

View File

@ -19,6 +19,7 @@ package org.apache.lucene.queryParser.ext;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.TestQueryParser;
@ -46,7 +47,7 @@ public class TestExtendableQueryParser extends TestQueryParser {
public QueryParser getParser(Analyzer a, Extensions extensions)
throws Exception {
if (a == null)
a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
QueryParser qp = extensions == null ? new ExtendableQueryParser(
TEST_VERSION_CURRENT, "field", a) : new ExtendableQueryParser(
TEST_VERSION_CURRENT, "field", a, extensions);

View File

@ -20,7 +20,6 @@ package org.apache.lucene.queryParser.precedence;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
@ -100,7 +99,7 @@ public class TestPrecedenceQueryParser extends LocalizedTestCase {
/** Filters MockTokenizer with StopFilter. */
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new QPTestFilter(new MockTokenizer(reader, MockAnalyzer.SIMPLE, true));
return new QPTestFilter(new MockTokenizer(reader, MockTokenizer.SIMPLE, true));
}
}
@ -130,7 +129,7 @@ public class TestPrecedenceQueryParser extends LocalizedTestCase {
public PrecedenceQueryParser getParser(Analyzer a) throws Exception {
if (a == null)
a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
PrecedenceQueryParser qp = new PrecedenceQueryParser("field", a);
qp.setDefaultOperator(PrecedenceQueryParser.OR_OPERATOR);
return qp;
@ -175,7 +174,7 @@ public class TestPrecedenceQueryParser extends LocalizedTestCase {
public Query getQueryDOA(String query, Analyzer a)
throws Exception {
if (a == null)
a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
PrecedenceQueryParser qp = new PrecedenceQueryParser("field", a);
qp.setDefaultOperator(PrecedenceQueryParser.AND_OPERATOR);
return qp.parse(query);

View File

@ -144,7 +144,7 @@ public class TestQPHelper extends LocalizedTestCase {
/** Filters MockTokenizer with StopFilter. */
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new QPTestFilter(new MockTokenizer(reader, MockAnalyzer.SIMPLE, true));
return new QPTestFilter(new MockTokenizer(reader, MockTokenizer.SIMPLE, true));
}
}
@ -204,7 +204,7 @@ public class TestQPHelper extends LocalizedTestCase {
public StandardQueryParser getParser(Analyzer a) throws Exception {
if (a == null)
a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(a);
@ -294,7 +294,7 @@ public class TestQPHelper extends LocalizedTestCase {
public Query getQueryDOA(String query, Analyzer a) throws Exception {
if (a == null)
a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(a);
qp.setDefaultOperator(Operator.AND);

View File

@ -141,7 +141,7 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
/** Filters MockTokenizer with StopFilter. */
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new QPTestFilter(new MockTokenizer(reader, MockAnalyzer.SIMPLE, true));
return new QPTestFilter(new MockTokenizer(reader, MockTokenizer.SIMPLE, true));
}
}
@ -219,7 +219,7 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
public QueryParserWrapper getParser(Analyzer a) throws Exception {
if (a == null)
a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
QueryParserWrapper qp = new QueryParserWrapper("field", a);
qp.setDefaultOperator(QueryParserWrapper.OR_OPERATOR);
return qp;
@ -304,7 +304,7 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
public Query getQueryDOA(String query, Analyzer a) throws Exception {
if (a == null)
a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
QueryParserWrapper qp = new QueryParserWrapper("field", a);
qp.setDefaultOperator(QueryParserWrapper.AND_OPERATOR);
return qp.parse(query);
@ -554,7 +554,7 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((TermRangeQuery)getQuery("[ a TO z]", null)).getRewriteMethod());
QueryParserWrapper qp = new QueryParserWrapper("field",
new MockAnalyzer(MockAnalyzer.SIMPLE, true));
new MockAnalyzer(MockTokenizer.SIMPLE, true));
qp.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE,((TermRangeQuery)qp.parse("[ a TO z]")).getRewriteMethod());
@ -685,7 +685,7 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
final String monthField = "month";
final String hourField = "hour";
QueryParserWrapper qp = new QueryParserWrapper("field",
new MockAnalyzer(MockAnalyzer.SIMPLE, true));
new MockAnalyzer(MockTokenizer.SIMPLE, true));
// Don't set any date resolution and verify if DateField is used
assertDateRangeQueryEquals(qp, defaultField, startDate, endDate,

View File

@ -21,52 +21,72 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
/**
* Analyzer for testing
*/
public final class MockAnalyzer extends Analyzer {
/** Acts Similar to WhitespaceAnalyzer */
public static final CharacterRunAutomaton WHITESPACE =
new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton());
/** Acts Similar to KeywordAnalyzer.
* TODO: Keyword returns an "empty" token for an empty reader...
*/
public static final CharacterRunAutomaton KEYWORD =
new CharacterRunAutomaton(new RegExp(".*").toAutomaton());
/** Acts like SimpleAnalyzer/LetterTokenizer. */
// the ugly regex below is Unicode 5.2 [:Letter:]
public static final CharacterRunAutomaton SIMPLE =
new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬˮͰ-ʹͶͷͺ-ͽΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԥԱ-Ֆՙա-ևא-תװ-ײء-يٮٯٱ-ۓەۥۦۮۯۺ-ۼۿܐܒ-ܯݍ-ޥޱߊ-ߪߴߵߺࠀ-ࠕࠚࠤࠨऄ-हऽॐक़-ॡॱॲॹ-ॿঅ-ঌএঐও-নপ-রলশ-হঽৎড়ঢ়য়-ৡৰৱਅ-ਊਏਐਓ-ਨਪ-ਰਲਲ਼ਵਸ਼ਸਹਖ਼-ੜਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલળવ-હઽૐૠૡଅ-ଌଏଐଓ-ନପ-ରଲଳଵ-ହଽଡ଼ଢ଼ୟ-ୡୱஃஅ-ஊஎ-ஐஒ-கஙசஜஞடணதந-பம-ஹௐఅ-ఌఎ-ఐఒ-నప-ళవ-హఽౘౙౠౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽೞೠೡഅ-ഌഎ-ഐഒ-നപ-ഹഽൠൡൺ-ൿඅ-ඖක-නඳ-රලව-ෆก-ะาำเ-ๆກຂຄງຈຊຍດ-ທນ-ຟມ-ຣລວສຫອ-ະາຳຽເ-ໄໆໜໝༀཀ-ཇཉ-ཬྈ-ྋက-ဪဿၐ-ၕၚ-ၝၡၥၦၮ-ၰၵ-ႁႎႠ-Ⴥა-ჺჼᄀ-ቈቊ-ቍቐ-ቖቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗៜᠠ-ᡷᢀ-ᢨᢪᢰ-ᣵᤀ-ᤜᥐ-ᥭᥰ-ᥴᦀ-ᦫᧁ-ᧇᨀ-ᨖᨠ-ᩔᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮᮯᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱⁿₐ-ₔℂℇℊ--ℝℤΩℨK--ℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⴀ-ⴥⴰ-ⵥⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ々〆〱-〵〻〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆷㇰ-ㇿ㐀-䶵一-鿋ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪꘫꙀ-ꙟꙢ-ꙮꙿ-ꚗꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋꞌꟻ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺꪀ-ꪯꪱꪵꪶꪹ-ꪽꫀꫂꫛ-ꫝꯀ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-鶴侮-舘並-龎ff-stﬓ-ﬗיִײַ-ﬨשׁ-זּטּ-לּמּנּסּףּפּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA--zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌞𐌰-𐍀𐍂-𐍉𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐠀-𐠅𐠈𐠊-𐠵𐠷𐠸𐠼𐠿-𐡕𐤀-𐤕𐤠-𐤹𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐬀-𐬵𐭀-𐭕𐭠-𐭲𐰀-𐱈𑂃-𑂯𒀀-𒍮𓀀-𓐮𝐀-𝑔𝑖-𝒜𝒞𝒟𝒢𝒥𝒦𝒩-𝒬𝒮-𝒹𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𠀀-𪛖𪜀-𫜴丽-𪘀]+").toAutomaton());
public final class MockAnalyzer extends Analyzer {
private final CharacterRunAutomaton runAutomaton;
private final boolean lowerCase;
public MockAnalyzer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {
private final CharacterRunAutomaton filter;
private final boolean enablePositionIncrements;
/**
* Creates a new MockAnalyzer.
*
* @param runAutomaton DFA describing how tokenization should happen (e.g. [a-zA-Z]+)
* @param lowerCase true if the tokenizer should lowercase terms
* @param filter DFA describing how terms should be filtered (set of stopwords, etc)
* @param enablePositionIncrements true if position increments should reflect filtered terms.
*/
public MockAnalyzer(CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
this.runAutomaton = runAutomaton;
this.lowerCase = lowerCase;
this.filter = filter;
this.enablePositionIncrements = enablePositionIncrements;
}
/**
* Creates a new MockAnalyzer, with no filtering.
*
* @param runAutomaton DFA describing how tokenization should happen (e.g. [a-zA-Z]+)
* @param lowerCase true if the tokenizer should lowercase terms
*/
public MockAnalyzer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {
this(runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false);
}
/**
* Create a Whitespace-lowercasing analyzer with no stopwords removal
*/
public MockAnalyzer() {
this(WHITESPACE, true);
this(MockTokenizer.WHITESPACE, true);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new MockTokenizer(reader, runAutomaton, lowerCase);
MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase);
return new MockTokenFilter(tokenizer, filter, enablePositionIncrements);
}
private class SavedStreams {
MockTokenizer tokenizer;
MockTokenFilter filter;
}
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
MockTokenizer t = (MockTokenizer) getPreviousTokenStream();
if (t == null) {
t = new MockTokenizer(reader, runAutomaton, lowerCase);
setPreviousTokenStream(t);
SavedStreams saved = (SavedStreams) getPreviousTokenStream();
if (saved == null) {
saved = new SavedStreams();
saved.tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase);
saved.filter = new MockTokenFilter(saved.tokenizer, filter, enablePositionIncrements);
setPreviousTokenStream(saved);
return saved.filter;
} else {
t.reset(reader);
saved.tokenizer.reset(reader);
return saved.filter;
}
return t;
}
}

View File

@ -0,0 +1,101 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.util.automaton.BasicAutomata.makeEmpty;
import static org.apache.lucene.util.automaton.BasicAutomata.makeString;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
 * Test-only {@link TokenFilter} that drops every term accepted by a
 * {@link CharacterRunAutomaton}.
 * <p>
 * Depending on the automaton supplied it can stand in for several filters:
 * <ul>
 *  <li>a union of single-string automata acts like a stopfilter;
 *  <li>the complement of such a union acts like a keepwordfilter;
 *  <li>a regex like <code>.{12,}</code> acts like a lengthfilter.
 * </ul>
 */
public final class MockTokenFilter extends TokenFilter {

  /** Automaton accepting nothing, i.e. a filter that removes no terms. */
  public static final CharacterRunAutomaton EMPTY_STOPSET =
      new CharacterRunAutomaton(makeEmpty());

  /** Automaton accepting a set of common English stopwords. */
  public static final CharacterRunAutomaton ENGLISH_STOPSET =
      new CharacterRunAutomaton(
          BasicOperations.union(
              Arrays.asList(
                  makeString("a"), makeString("an"), makeString("and"), makeString("are"),
                  makeString("as"), makeString("at"), makeString("be"), makeString("but"),
                  makeString("by"), makeString("for"), makeString("if"), makeString("in"),
                  makeString("into"), makeString("is"), makeString("it"), makeString("no"),
                  makeString("not"), makeString("of"), makeString("on"), makeString("or"),
                  makeString("such"), makeString("that"), makeString("the"), makeString("their"),
                  makeString("then"), makeString("there"), makeString("these"), makeString("they"),
                  makeString("this"), makeString("to"), makeString("was"), makeString("will"),
                  makeString("with"))));

  private final CharacterRunAutomaton filter;
  // always assigned by the constructor; may be changed later through the setter
  private boolean enablePositionIncrements;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  /**
   * Creates a filter over <code>input</code>.
   *
   * @param input stream whose tokens are examined
   * @param filter automaton deciding which terms are removed (accepted terms are dropped)
   * @param enablePositionIncrements true if the position increments of removed
   *        terms should be added to the next term that is kept
   */
  public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
    super(input);
    this.filter = filter;
    this.enablePositionIncrements = enablePositionIncrements;
  }

  /**
   * Advances to the next term that the automaton does not accept.
   *
   * @return true if such a term was found, false once the input is exhausted
   */
  @Override
  public boolean incrementToken() throws IOException {
    // sum of the position increments of the terms dropped during this call
    int droppedIncrements = 0;
    while (input.incrementToken()) {
      final boolean removed = filter.run(termAtt.buffer(), 0, termAtt.length());
      if (removed) {
        droppedIncrements += posIncrAtt.getPositionIncrement();
        continue;
      }
      // first surviving term: optionally account for the gap left by dropped terms
      if (enablePositionIncrements) {
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + droppedIncrements);
      }
      return true;
    }
    // input exhausted without finding another term to keep
    return false;
  }

  /**
   * Tells whether removed terms contribute to the position increment of the
   * next kept term.
   *
   * @see #setEnablePositionIncrements(boolean)
   */
  public boolean getEnablePositionIncrements() {
    return enablePositionIncrements;
  }

  /**
   * If <code>true</code>, this filter preserves the positions of the incoming
   * tokens: the position increments of removed tokens are accumulated and
   * added to the next token that is kept.
   */
  public void setEnablePositionIncrements(boolean enable) {
    this.enablePositionIncrements = enable;
  }
}

View File

@ -22,11 +22,25 @@ import java.io.Reader;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
/**
* Automaton-based tokenizer for testing. Optionally lowercases.
*/
public class MockTokenizer extends CharTokenizer {
/** Acts similarly to WhitespaceTokenizer. */
public static final CharacterRunAutomaton WHITESPACE =
new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton());
/** Acts Similar to KeywordTokenizer.
* TODO: Keyword returns an "empty" token for an empty reader...
*/
public static final CharacterRunAutomaton KEYWORD =
new CharacterRunAutomaton(new RegExp(".*").toAutomaton());
/** Acts like LetterTokenizer. */
// the ugly regex below is Unicode 5.2 [:Letter:]
public static final CharacterRunAutomaton SIMPLE =
new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬˮͰ-ʹͶͷͺ-ͽΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԥԱ-Ֆՙա-ևא-תװ-ײء-يٮٯٱ-ۓەۥۦۮۯۺ-ۼۿܐܒ-ܯݍ-ޥޱߊ-ߪߴߵߺࠀ-ࠕࠚࠤࠨऄ-हऽॐक़-ॡॱॲॹ-ॿঅ-ঌএঐও-নপ-রলশ-হঽৎড়ঢ়য়-ৡৰৱਅ-ਊਏਐਓ-ਨਪ-ਰਲਲ਼ਵਸ਼ਸਹਖ਼-ੜਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલળવ-હઽૐૠૡଅ-ଌଏଐଓ-ନପ-ରଲଳଵ-ହଽଡ଼ଢ଼ୟ-ୡୱஃஅ-ஊஎ-ஐஒ-கஙசஜஞடணதந-பம-ஹௐఅ-ఌఎ-ఐఒ-నప-ళవ-హఽౘౙౠౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽೞೠೡഅ-ഌഎ-ഐഒ-നപ-ഹഽൠൡൺ-ൿඅ-ඖක-නඳ-රලව-ෆก-ะาำเ-ๆກຂຄງຈຊຍດ-ທນ-ຟມ-ຣລວສຫອ-ະາຳຽເ-ໄໆໜໝༀཀ-ཇཉ-ཬྈ-ྋက-ဪဿၐ-ၕၚ-ၝၡၥၦၮ-ၰၵ-ႁႎႠ-Ⴥა-ჺჼᄀ-ቈቊ-ቍቐ-ቖቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗៜᠠ-ᡷᢀ-ᢨᢪᢰ-ᣵᤀ-ᤜᥐ-ᥭᥰ-ᥴᦀ-ᦫᧁ-ᧇᨀ-ᨖᨠ-ᩔᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮᮯᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱⁿₐ-ₔℂℇℊ--ℝℤΩℨK--ℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⴀ-ⴥⴰ-ⵥⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ々〆〱-〵〻〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆷㇰ-ㇿ㐀-䶵一-鿋ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪꘫꙀ-ꙟꙢ-ꙮꙿ-ꚗꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋꞌꟻ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺꪀ-ꪯꪱꪵꪶꪹ-ꪽꫀꫂꫛ-ꫝꯀ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-鶴侮-舘並-龎ff-stﬓ-ﬗיִײַ-ﬨשׁ-זּטּ-לּמּנּסּףּפּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA--zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌞𐌰-𐍀𐍂-𐍉𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐠀-𐠅𐠈𐠊-𐠵𐠷𐠸𐠼𐠿-𐡕𐤀-𐤕𐤠-𐤹𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐬀-𐬵𐭀-𐭕𐭠-𐭲𐰀-𐱈𑂃-𑂯𒀀-𒍮𓀀-𓐮𝐀-𝑔𝑖-𝒜𝒞𝒟𝒢𝒥𝒦𝒩-𝒬𝒮-𝒹𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𠀀-𪛖𪜀-𫜴丽-𪘀]+").toAutomaton());
private final CharacterRunAutomaton runAutomaton;
private final boolean lowerCase;
private int state;

View File

@ -1,5 +1,13 @@
package org.apache.lucene.analysis;
import java.util.Arrays;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -19,6 +27,7 @@ package org.apache.lucene.analysis;
public class TestMockAnalyzer extends BaseTokenStreamTestCase {
/** Test a configuration that behaves a lot like WhitespaceAnalyzer */
public void testWhitespace() throws Exception {
Analyzer a = new MockAnalyzer();
assertAnalyzesTo(a, "A bc defg hiJklmn opqrstuv wxy z ",
@ -29,8 +38,9 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
new String[] { "break", "on", "whitespace" });
}
/** Test a configuration that behaves a lot like SimpleAnalyzer */
public void testSimple() throws Exception {
Analyzer a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
Analyzer a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
assertAnalyzesTo(a, "a-bc123 defg+hijklmn567opqrstuv78wxy_z ",
new String[] { "a", "bc", "defg", "hijklmn", "opqrstuv", "wxy", "z" });
assertAnalyzesToReuse(a, "aba4cadaba-Shazam",
@ -39,8 +49,9 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
new String[] { "break", "on", "letters" });
}
/** Test a configuration that behaves a lot like KeywordAnalyzer */
public void testKeyword() throws Exception {
Analyzer a = new MockAnalyzer(MockAnalyzer.KEYWORD, false);
Analyzer a = new MockAnalyzer(MockTokenizer.KEYWORD, false);
assertAnalyzesTo(a, "a-bc123 defg+hijklmn567opqrstuv78wxy_z ",
new String[] { "a-bc123 defg+hijklmn567opqrstuv78wxy_z " });
assertAnalyzesToReuse(a, "aba4cadaba-Shazam",
@ -48,4 +59,40 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesToReuse(a, "break+on/Nothing",
new String[] { "break+on/Nothing" });
}
/** Test a configuration that behaves a lot like StopAnalyzer */
public void testStop() throws Exception {
  final String text = "the quick brown a fox";
  final String[] keptTerms = { "quick", "brown", "fox" };

  // position increments enabled: the removed "the" and "a" show up as gaps
  Analyzer gapped = new MockAnalyzer(MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
  assertAnalyzesTo(gapped, text, keptTerms, new int[] { 2, 1, 2 });

  // position increments disabled: removed terms leave no trace
  Analyzer ungapped = new MockAnalyzer(MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false);
  assertAnalyzesTo(ungapped, text, keptTerms, new int[] { 1, 1, 1 });
}
/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
  // the filter removes what its automaton accepts, so accept everything
  // except the words to keep
  Automaton wanted = Automaton.union(
      Arrays.asList(BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")));
  CharacterRunAutomaton keepWords =
      new CharacterRunAutomaton(BasicOperations.complement(wanted));

  Analyzer a = new MockAnalyzer(MockTokenizer.SIMPLE, true, keepWords, true);
  final String[] expectedTerms = { "foo", "bar", "bar", "foo" };
  final int[] expectedIncrements = { 2, 2, 1, 2 };
  assertAnalyzesTo(a, "quick foo brown bar bar fox foo", expectedTerms, expectedIncrements);
}
/** Test a configuration that behaves a lot like LengthFilter */
public void testLength() throws Exception {
  // accepts (and therefore removes) any term of five or more characters
  RegExp fiveOrMore = new RegExp(".{5,}");
  CharacterRunAutomaton length5 = new CharacterRunAutomaton(fiveOrMore.toAutomaton());

  Analyzer a = new MockAnalyzer(MockTokenizer.WHITESPACE, true, length5, true);
  final String[] expectedTerms = { "ok", "fine" };
  final int[] expectedIncrements = { 1, 2 };
  assertAnalyzesTo(a, "ok toolong fine notfine", expectedTerms, expectedIncrements);
}
}

View File

@ -101,7 +101,7 @@ public class TestMultiLevelSkipList extends LuceneTestCase {
private static class PayloadAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new PayloadFilter(new MockTokenizer(reader, MockAnalyzer.WHITESPACE, true));
return new PayloadFilter(new MockTokenizer(reader, MockTokenizer.WHITESPACE, true));
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.index;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MockRAMDirectory;
@ -35,7 +36,7 @@ import java.io.File;
public class TestThreadedOptimize extends LuceneTestCase {
private static final Analyzer ANALYZER = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
private static final Analyzer ANALYZER = new MockAnalyzer(MockTokenizer.SIMPLE, true);
private final static int NUM_THREADS = 3;
//private final static int NUM_THREADS = 5;

View File

@ -128,7 +128,7 @@ public class TestQueryParser extends LocalizedTestCase {
/** Filters MockTokenizer with StopFilter. */
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new QPTestFilter(new MockTokenizer(reader, MockAnalyzer.SIMPLE, true));
return new QPTestFilter(new MockTokenizer(reader, MockTokenizer.SIMPLE, true));
}
}
@ -158,7 +158,7 @@ public class TestQueryParser extends LocalizedTestCase {
public QueryParser getParser(Analyzer a) throws Exception {
if (a == null)
a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", a);
qp.setDefaultOperator(QueryParser.OR_OPERATOR);
return qp;
@ -228,7 +228,7 @@ public class TestQueryParser extends LocalizedTestCase {
public Query getQueryDOA(String query, Analyzer a)
throws Exception {
if (a == null)
a = new MockAnalyzer(MockAnalyzer.SIMPLE, true);
a = new MockAnalyzer(MockTokenizer.SIMPLE, true);
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", a);
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
return qp.parse(query);
@ -456,7 +456,7 @@ public class TestQueryParser extends LocalizedTestCase {
assertQueryEquals("[ a TO z]", null, "[a TO z]");
assertEquals(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT, ((TermRangeQuery)getQuery("[ a TO z]", null)).getRewriteMethod());
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(MockAnalyzer.SIMPLE, true));
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(MockTokenizer.SIMPLE, true));
qp.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
assertEquals(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE,((TermRangeQuery)qp.parse("[ a TO z]")).getRewriteMethod());
@ -579,7 +579,7 @@ public class TestQueryParser extends LocalizedTestCase {
final String defaultField = "default";
final String monthField = "month";
final String hourField = "hour";
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(MockAnalyzer.SIMPLE, true));
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(MockTokenizer.SIMPLE, true));
// Don't set any date resolution and verify if DateField is used
assertDateRangeQueryEquals(qp, defaultField, startDate, endDate,

View File

@ -22,6 +22,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
@ -77,7 +78,7 @@ public class TestFuzzyQuery2 extends LuceneTestCase {
int terms = (int) Math.pow(2, bits);
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new MockAnalyzer(MockAnalyzer.KEYWORD, false),
IndexWriter writer = new IndexWriter(dir, new MockAnalyzer(MockTokenizer.KEYWORD, false),
IndexWriter.MaxFieldLength.UNLIMITED);
Document doc = new Document();

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
@ -617,7 +618,7 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
/* build an index */
RAMDirectory farsiIndex = new RAMDirectory();
IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer(MockAnalyzer.SIMPLE, true)));
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.SIMPLE, true)));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628", Field.Store.YES,
Field.Index.NOT_ANALYZED));
@ -657,7 +658,7 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
/* build an index */
RAMDirectory danishIndex = new RAMDirectory();
IndexWriter writer = new IndexWriter(danishIndex, new IndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer(MockAnalyzer.SIMPLE, true)));
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.SIMPLE, true)));
// Danish collation orders the words below in the given order
// (example taken from TestSort.testInternationalSort() ).

View File

@ -334,7 +334,7 @@ final class TestPayloadAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new MockTokenizer(reader, MockAnalyzer.WHITESPACE, true);
TokenStream result = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
return new PayloadFilter(result, fieldName);
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -44,7 +45,7 @@ public class TestTermVectors extends LuceneTestCase {
protected void setUp() throws Exception {
super.setUp();
IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer(MockAnalyzer.SIMPLE, true)));
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.SIMPLE, true)));
//writer.setUseCompoundFile(true);
//writer.infoStream = System.out;
for (int i = 0; i < 1000; i++) {
@ -96,7 +97,7 @@ public class TestTermVectors extends LuceneTestCase {
public void testTermVectorsFieldOrder() throws IOException {
Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer(MockAnalyzer.SIMPLE, true)));
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.SIMPLE, true)));
Document doc = new Document();
doc.add(new Field("c", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
doc.add(new Field("a", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
@ -236,7 +237,7 @@ public class TestTermVectors extends LuceneTestCase {
try {
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
TEST_VERSION_CURRENT,
new MockAnalyzer(MockAnalyzer.SIMPLE, true))
new MockAnalyzer(MockTokenizer.SIMPLE, true))
.setOpenMode(OpenMode.CREATE));
writer.addDocument(testDoc1);
writer.addDocument(testDoc2);
@ -352,7 +353,7 @@ public class TestTermVectors extends LuceneTestCase {
// Test only a few docs having vectors
public void testRareVectors() throws IOException {
IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer(MockAnalyzer.SIMPLE, true))
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.SIMPLE, true))
.setOpenMode(OpenMode.CREATE));
for (int i = 0; i < 100; i++) {
Document doc = new Document();
@ -386,7 +387,7 @@ public class TestTermVectors extends LuceneTestCase {
public void testMixedVectrosVectors() throws IOException {
IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(
TEST_VERSION_CURRENT,
new MockAnalyzer(MockAnalyzer.SIMPLE, true)).setOpenMode(OpenMode.CREATE));
new MockAnalyzer(MockTokenizer.SIMPLE, true)).setOpenMode(OpenMode.CREATE));
Document doc = new Document();
doc.add(new Field("field", "one",
Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));

View File

@ -52,7 +52,7 @@ public class PayloadHelper {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new MockTokenizer(reader, MockAnalyzer.SIMPLE, true);
TokenStream result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
result = new PayloadFilter(result, fieldName);
return result;
}

View File

@ -20,7 +20,6 @@ import java.io.Reader;
import java.util.Collection;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@ -59,7 +58,7 @@ public class TestPayloadNearQuery extends LuceneTestCase {
private class PayloadAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new MockTokenizer(reader, MockAnalyzer.SIMPLE, true);
TokenStream result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
result = new PayloadFilter(result, fieldName);
return result;
}

View File

@ -30,10 +30,8 @@ import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.IndexWriterConfig;
@ -69,7 +67,7 @@ public class TestPayloadTermQuery extends LuceneTestCase {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new MockTokenizer(reader, MockAnalyzer.SIMPLE, true);
TokenStream result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
result = new PayloadFilter(result, fieldName);
return result;
}

View File

@ -20,7 +20,7 @@ package org.apache.lucene.search.spans;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
@ -58,7 +58,7 @@ public class TestBasics extends LuceneTestCase {
super.setUp();
RAMDirectory directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer(MockAnalyzer.SIMPLE, true)));
TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.SIMPLE, true)));
//writer.infoStream = System.out;
for (int i = 0; i < 1000; i++) {
Document doc = new Document();

View File

@ -467,7 +467,7 @@ public class TestPayloadSpans extends LuceneTestCase {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new MockTokenizer(reader, MockAnalyzer.SIMPLE, true);
TokenStream result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
result = new PayloadFilter(result, fieldName);
return result;
}
@ -519,7 +519,7 @@ public class TestPayloadSpans extends LuceneTestCase {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new MockTokenizer(reader, MockAnalyzer.SIMPLE, true);
TokenStream result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
result = new PayloadFilter(result, fieldName);
return result;
}