mirror of https://github.com/apache/lucene.git
LUCENE-2413: consolidate remaining solr tokenstreams into modules/analysis
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@957162 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
653c7c160b
commit
8f71031ac8
|
@ -27,11 +27,14 @@ New Features
|
||||||
with text contained in the required words (inverse of StopFilter).
|
with text contained in the required words (inverse of StopFilter).
|
||||||
- o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that puts
|
- o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that puts
|
||||||
hyphenated words broken into two lines back together.
|
hyphenated words broken into two lines back together.
|
||||||
|
- o.a.l.analysis.miscellaneous.CapitalizationFilter: A TokenFilter that applies
|
||||||
|
capitalization rules to tokens.
|
||||||
- o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
|
- o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
|
||||||
CharFilter, Tokenizer, and Tokenfilter for transforming text with regexes.
|
CharFilter, Tokenizer, and Tokenfilter for transforming text with regexes.
|
||||||
- o.a.l.analysis.synonym.SynonymFilter: A synonym filter that supports multi-word
|
- o.a.l.analysis.synonym.SynonymFilter: A synonym filter that supports multi-word
|
||||||
synonyms.
|
synonyms.
|
||||||
(... in progress)
|
- o.a.l.analysis.phonetic: Package for phonetic search, containing various
|
||||||
|
phonetic encoders such as Double Metaphone.
|
||||||
|
|
||||||
* LUCENE-2413: Consolidated all Lucene analyzers into common.
|
* LUCENE-2413: Consolidated all Lucene analyzers into common.
|
||||||
- o.a.l.analysis.KeywordAnalyzer -> o.a.l.analysis.core.KeywordAnalyzer
|
- o.a.l.analysis.KeywordAnalyzer -> o.a.l.analysis.core.KeywordAnalyzer
|
||||||
|
@ -60,7 +63,6 @@ New Features
|
||||||
- o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
|
- o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
|
||||||
- o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
|
- o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
|
||||||
- o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
|
- o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
|
||||||
... (in progress)
|
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,10 @@ Copyright 2006 The Apache Software Foundation
|
||||||
This product includes software developed by
|
This product includes software developed by
|
||||||
The Apache Software Foundation (http://www.apache.org/).
|
The Apache Software Foundation (http://www.apache.org/).
|
||||||
|
|
||||||
|
Includes software from other Apache Software Foundation projects,
|
||||||
|
including, but not limited to:
|
||||||
|
- Apache Commons
|
||||||
|
|
||||||
The snowball stemmers in
|
The snowball stemmers in
|
||||||
common/src/java/net/sf/snowball
|
common/src/java/net/sf/snowball
|
||||||
were developed by Martin Porter and Richard Boulton.
|
were developed by Martin Porter and Richard Boulton.
|
||||||
|
|
|
@ -20,7 +20,12 @@ lucene-analyzers-common-XX.jar
|
||||||
lucene-analyzers-icu-XX.jar
|
lucene-analyzers-icu-XX.jar
|
||||||
An add-on analysis library that provides improved Unicode support via
|
An add-on analysis library that provides improved Unicode support via
|
||||||
International Components for Unicode (ICU). Note: this module depends on
|
International Components for Unicode (ICU). Note: this module depends on
|
||||||
the ICU4j jar file (version > 4.4.0)
|
the ICU4j jar file (version >= 4.4.0)
|
||||||
|
|
||||||
|
lucene-analyzers-phonetic-XX.jar
|
||||||
|
An add-on analysis library that provides phonetic encoders via Apache
|
||||||
|
Commons-Codec. Note: this module depends on the commons-codec jar
|
||||||
|
file (version >= 1.4)
|
||||||
|
|
||||||
lucene-analyzers-smartcn-XX.jar
|
lucene-analyzers-smartcn-XX.jar
|
||||||
An add-on analysis library that provides word segmentation for Simplified
|
An add-on analysis library that provides word segmentation for Simplified
|
||||||
|
@ -32,12 +37,14 @@ lucene-analyzers-stempel-XX.jar
|
||||||
|
|
||||||
common/src/java
|
common/src/java
|
||||||
icu/src/java
|
icu/src/java
|
||||||
|
phonetic/src/java
|
||||||
smartcn/src/java
|
smartcn/src/java
|
||||||
stempel/src/java
|
stempel/src/java
|
||||||
The source code for the four libraries.
|
The source code for the five libraries.
|
||||||
|
|
||||||
common/src/test
|
common/src/test
|
||||||
icu/src/test
|
icu/src/test
|
||||||
|
phonetic/src/test
|
||||||
smartcn/src/test
|
smartcn/src/test
|
||||||
stempel/src/test
|
stempel/src/test
|
||||||
Unit tests for the four libraries.
|
Unit tests for the five libraries.
|
||||||
|
|
|
@ -35,6 +35,10 @@
|
||||||
<ant dir="icu" />
|
<ant dir="icu" />
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
<target name="phonetic">
|
||||||
|
<ant dir="phonetic" />
|
||||||
|
</target>
|
||||||
|
|
||||||
<target name="smartcn">
|
<target name="smartcn">
|
||||||
<ant dir="smartcn" />
|
<ant dir="smartcn" />
|
||||||
</target>
|
</target>
|
||||||
|
@ -44,29 +48,33 @@
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="default" depends="compile"/>
|
<target name="default" depends="compile"/>
|
||||||
<target name="compile" depends="common,icu,smartcn,stempel" />
|
<target name="compile" depends="common,icu,phonetic,smartcn,stempel" />
|
||||||
|
|
||||||
<target name="clean">
|
<target name="clean">
|
||||||
<ant dir="common" target="clean" />
|
<ant dir="common" target="clean" />
|
||||||
<ant dir="icu" target="clean" />
|
<ant dir="icu" target="clean" />
|
||||||
|
<ant dir="phonetic" target="clean" />
|
||||||
<ant dir="smartcn" target="clean" />
|
<ant dir="smartcn" target="clean" />
|
||||||
<ant dir="stempel" target="clean" />
|
<ant dir="stempel" target="clean" />
|
||||||
</target>
|
</target>
|
||||||
<target name="compile-core">
|
<target name="compile-core">
|
||||||
<ant dir="common" target="compile-core" />
|
<ant dir="common" target="compile-core" />
|
||||||
<ant dir="icu" target="compile-core" />
|
<ant dir="icu" target="compile-core" />
|
||||||
|
<ant dir="phonetic" target="compile-core" />
|
||||||
<ant dir="smartcn" target="compile-core" />
|
<ant dir="smartcn" target="compile-core" />
|
||||||
<ant dir="stempel" target="compile-core" />
|
<ant dir="stempel" target="compile-core" />
|
||||||
</target>
|
</target>
|
||||||
<target name="compile-test">
|
<target name="compile-test">
|
||||||
<ant dir="common" target="compile-test" />
|
<ant dir="common" target="compile-test" />
|
||||||
<ant dir="icu" target="compile-test" />
|
<ant dir="icu" target="compile-test" />
|
||||||
|
<ant dir="phonetic" target="compile-test" />
|
||||||
<ant dir="smartcn" target="compile-test" />
|
<ant dir="smartcn" target="compile-test" />
|
||||||
<ant dir="stempel" target="compile-test" />
|
<ant dir="stempel" target="compile-test" />
|
||||||
</target>
|
</target>
|
||||||
<target name="test">
|
<target name="test">
|
||||||
<ant dir="common" target="test" />
|
<ant dir="common" target="test" />
|
||||||
<ant dir="icu" target="test" />
|
<ant dir="icu" target="test" />
|
||||||
|
<ant dir="phonetic" target="test" />
|
||||||
<ant dir="smartcn" target="test" />
|
<ant dir="smartcn" target="test" />
|
||||||
<ant dir="stempel" target="test" />
|
<ant dir="stempel" target="test" />
|
||||||
</target>
|
</target>
|
||||||
|
@ -76,6 +84,7 @@
|
||||||
<target name="dist-maven" depends="default">
|
<target name="dist-maven" depends="default">
|
||||||
<ant dir="common" target="dist-maven" />
|
<ant dir="common" target="dist-maven" />
|
||||||
<ant dir="icu" target="dist-maven" />
|
<ant dir="icu" target="dist-maven" />
|
||||||
|
<ant dir="phonetic" target="dist-maven" />
|
||||||
<ant dir="smartcn" target="dist-maven" />
|
<ant dir="smartcn" target="dist-maven" />
|
||||||
<ant dir="stempel" target="dist-maven" />
|
<ant dir="stempel" target="dist-maven" />
|
||||||
</target>
|
</target>
|
||||||
|
@ -83,6 +92,7 @@
|
||||||
<target name="javadocs">
|
<target name="javadocs">
|
||||||
<ant dir="common" target="javadocs" />
|
<ant dir="common" target="javadocs" />
|
||||||
<ant dir="icu" target="javadocs" />
|
<ant dir="icu" target="javadocs" />
|
||||||
|
<ant dir="phonetic" target="javadocs" />
|
||||||
<ant dir="smartcn" target="javadocs" />
|
<ant dir="smartcn" target="javadocs" />
|
||||||
<ant dir="stempel" target="javadocs" />
|
<ant dir="stempel" target="javadocs" />
|
||||||
</target>
|
</target>
|
||||||
|
@ -90,6 +100,7 @@
|
||||||
<target name="javadocs-index.html">
|
<target name="javadocs-index.html">
|
||||||
<ant dir="common" target="javadocs-index.html" />
|
<ant dir="common" target="javadocs-index.html" />
|
||||||
<ant dir="icu" target="javadocs-index.html" />
|
<ant dir="icu" target="javadocs-index.html" />
|
||||||
|
<ant dir="phonetic" target="javadocs-index.html" />
|
||||||
<ant dir="smartcn" target="javadocs-index.html" />
|
<ant dir="smartcn" target="javadocs-index.html" />
|
||||||
<ant dir="stempel" target="javadocs-index.html" />
|
<ant dir="stempel" target="javadocs-index.html" />
|
||||||
</target>
|
</target>
|
||||||
|
|
|
@ -0,0 +1,181 @@
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collection;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A filter to apply normal capitalization rules to Tokens. It will make the first letter
|
||||||
|
* capital and the rest lower case.
|
||||||
|
* <p/>
|
||||||
|
* This filter is particularly useful to build nice looking facet parameters. This filter
|
||||||
|
* is not appropriate if you intend to use a prefix query.
|
||||||
|
*/
|
||||||
|
public final class CapitalizationFilter extends TokenFilter {
|
||||||
|
public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
|
||||||
|
public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
|
||||||
|
|
||||||
|
private final boolean onlyFirstWord;
|
||||||
|
private final CharArraySet keep;
|
||||||
|
private final boolean forceFirstLetter;
|
||||||
|
private final Collection<char[]> okPrefix;
|
||||||
|
|
||||||
|
private final int minWordLength;
|
||||||
|
private final int maxWordCount;
|
||||||
|
private final int maxTokenLength;
|
||||||
|
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a CapitalizationFilter with the default parameters.
|
||||||
|
* <p>
|
||||||
|
* Calls {@link #CapitalizationFilter(TokenStream, boolean, CharArraySet, boolean, Collection, int, int, int)
|
||||||
|
* CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH)}
|
||||||
|
*/
|
||||||
|
public CapitalizationFilter(TokenStream in) {
|
||||||
|
this(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a CapitalizationFilter with the specified parameters.
|
||||||
|
* @param in input tokenstream
|
||||||
|
* @param onlyFirstWord should each word be capitalized or all of the words?
|
||||||
|
* @param keep a keep word list. Each word that should be kept separated by whitespace.
|
||||||
|
* @param forceFirstLetter Force the first letter to be capitalized even if it is in the keep list.
|
||||||
|
* @param okPrefix do not change word capitalization if a word begins with something in this list.
|
||||||
|
* @param minWordLength how long the word needs to be to get capitalization applied. If the
|
||||||
|
* minWordLength is 3, "and" > "And" but "or" stays "or".
|
||||||
|
* @param maxWordCount if the token contains more then maxWordCount words, the capitalization is
|
||||||
|
* assumed to be correct.
|
||||||
|
* @param maxTokenLength ???
|
||||||
|
*/
|
||||||
|
public CapitalizationFilter(TokenStream in, boolean onlyFirstWord, CharArraySet keep,
|
||||||
|
boolean forceFirstLetter, Collection<char[]> okPrefix, int minWordLength,
|
||||||
|
int maxWordCount, int maxTokenLength) {
|
||||||
|
super(in);
|
||||||
|
this.onlyFirstWord = onlyFirstWord;
|
||||||
|
this.keep = keep;
|
||||||
|
this.forceFirstLetter = forceFirstLetter;
|
||||||
|
this.okPrefix = okPrefix;
|
||||||
|
this.minWordLength = minWordLength;
|
||||||
|
this.maxWordCount = maxWordCount;
|
||||||
|
this.maxTokenLength = maxTokenLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (!input.incrementToken()) return false;
|
||||||
|
|
||||||
|
char[] termBuffer = termAtt.buffer();
|
||||||
|
int termBufferLength = termAtt.length();
|
||||||
|
char[] backup = null;
|
||||||
|
|
||||||
|
if (maxWordCount < DEFAULT_MAX_WORD_COUNT) {
|
||||||
|
//make a backup in case we exceed the word count
|
||||||
|
backup = new char[termBufferLength];
|
||||||
|
System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (termBufferLength < maxTokenLength) {
|
||||||
|
int wordCount = 0;
|
||||||
|
|
||||||
|
int lastWordStart = 0;
|
||||||
|
for (int i = 0; i < termBufferLength; i++) {
|
||||||
|
char c = termBuffer[i];
|
||||||
|
if (c <= ' ' || c == '.') {
|
||||||
|
int len = i - lastWordStart;
|
||||||
|
if (len > 0) {
|
||||||
|
processWord(termBuffer, lastWordStart, len, wordCount++);
|
||||||
|
lastWordStart = i + 1;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// process the last word
|
||||||
|
if (lastWordStart < termBufferLength) {
|
||||||
|
processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wordCount > maxWordCount) {
|
||||||
|
termAtt.copyBuffer(backup, 0, termBufferLength);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processWord(char[] buffer, int offset, int length, int wordCount) {
|
||||||
|
if (length < 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (onlyFirstWord && wordCount > 0) {
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
|
||||||
|
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (keep != null && keep.contains(buffer, offset, length)) {
|
||||||
|
if (wordCount == 0 && forceFirstLetter) {
|
||||||
|
buffer[offset] = Character.toUpperCase(buffer[offset]);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (length < minWordLength) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (okPrefix != null) {
|
||||||
|
for (char[] prefix : okPrefix) {
|
||||||
|
if (length >= prefix.length) { //don't bother checking if the buffer length is less than the prefix
|
||||||
|
boolean match = true;
|
||||||
|
for (int i = 0; i < prefix.length; i++) {
|
||||||
|
if (prefix[i] != buffer[offset + i]) {
|
||||||
|
match = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (match == true) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We know it has at least one character
|
||||||
|
/*char[] chars = w.toCharArray();
|
||||||
|
StringBuilder word = new StringBuilder( w.length() );
|
||||||
|
word.append( Character.toUpperCase( chars[0] ) );*/
|
||||||
|
buffer[offset] = Character.toUpperCase(buffer[offset]);
|
||||||
|
|
||||||
|
for (int i = 1; i < length; i++) {
|
||||||
|
buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
|
||||||
|
}
|
||||||
|
//return word.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,121 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*;
|
||||||
|
|
||||||
|
/** Tests {@link CapitalizationFilter} */
|
||||||
|
public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
|
||||||
|
public void testCapitalization() throws Exception {
|
||||||
|
CharArraySet keep = new CharArraySet(TEST_VERSION_CURRENT,
|
||||||
|
Arrays.asList("and", "the", "it", "BIG"), false);
|
||||||
|
|
||||||
|
assertCapitalizesTo("kiTTEN", new String[] { "Kitten" },
|
||||||
|
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
assertCapitalizesTo("and", new String[] { "And" },
|
||||||
|
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
assertCapitalizesTo("AnD", new String[] { "And" },
|
||||||
|
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
//first is not forced, but it's not a keep word, either
|
||||||
|
assertCapitalizesTo("AnD", new String[] { "And" },
|
||||||
|
true, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
assertCapitalizesTo("big", new String[] { "Big" },
|
||||||
|
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
assertCapitalizesTo("BIG", new String[] { "BIG" },
|
||||||
|
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
assertCapitalizesToKeyword("Hello thEre my Name is Ryan", "Hello there my name is ryan",
|
||||||
|
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
// now each token
|
||||||
|
assertCapitalizesTo("Hello thEre my Name is Ryan",
|
||||||
|
new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" },
|
||||||
|
false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
// now only the long words
|
||||||
|
assertCapitalizesTo("Hello thEre my Name is Ryan",
|
||||||
|
new String[] { "Hello", "There", "my", "Name", "is", "Ryan" },
|
||||||
|
false, keep, true, null, 3, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
// without prefix
|
||||||
|
assertCapitalizesTo("McKinley",
|
||||||
|
new String[] { "Mckinley" },
|
||||||
|
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
// Now try some prefixes
|
||||||
|
List<char[]> okPrefix = new ArrayList<char[]>();
|
||||||
|
okPrefix.add("McK".toCharArray());
|
||||||
|
|
||||||
|
assertCapitalizesTo("McKinley",
|
||||||
|
new String[] { "McKinley" },
|
||||||
|
true, keep, true, okPrefix, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
// now try some stuff with numbers
|
||||||
|
assertCapitalizesTo("1st 2nd third",
|
||||||
|
new String[] { "1st", "2nd", "Third" },
|
||||||
|
false, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
assertCapitalizesToKeyword("the The the", "The The the",
|
||||||
|
false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void assertCapitalizesTo(Tokenizer tokenizer, String expected[],
|
||||||
|
boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
|
||||||
|
Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
|
||||||
|
int maxTokenLength) throws IOException {
|
||||||
|
CapitalizationFilter filter = new CapitalizationFilter(tokenizer, onlyFirstWord, keep,
|
||||||
|
forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
|
||||||
|
assertTokenStreamContents(filter, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void assertCapitalizesTo(String input, String expected[],
|
||||||
|
boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
|
||||||
|
Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
|
||||||
|
int maxTokenLength) throws IOException {
|
||||||
|
assertCapitalizesTo(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
|
||||||
|
expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength,
|
||||||
|
maxWordCount, maxTokenLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void assertCapitalizesToKeyword(String input, String expected,
|
||||||
|
boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
|
||||||
|
Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
|
||||||
|
int maxTokenLength) throws IOException {
|
||||||
|
assertCapitalizesTo(new KeywordTokenizer(new StringReader(input)),
|
||||||
|
new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
|
||||||
|
minWordLength, maxWordCount, maxTokenLength);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,63 @@
|
||||||
|
<?xml version="1.0"?>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project name="analyzers-phonetic" default="default">
|
||||||
|
|
||||||
|
<description>
|
||||||
|
Provides phonetic encoding support via Apache Commons Codec.
|
||||||
|
</description>
|
||||||
|
|
||||||
|
<property name="build.dir" location="../build/phonetic" />
|
||||||
|
<property name="dist.dir" location="../dist/phonetic" />
|
||||||
|
|
||||||
|
<path id="additional.dependencies">
|
||||||
|
<fileset dir="lib" includes="commons-codec-*.jar"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<pathconvert property="project.classpath"
|
||||||
|
targetos="unix"
|
||||||
|
refid="additional.dependencies"
|
||||||
|
/>
|
||||||
|
|
||||||
|
<import file="../../../lucene/contrib/contrib-build.xml"/>
|
||||||
|
|
||||||
|
<module-uptodate name="analysis/common" jarfile="../build/common/lucene-analyzers-common-${version}.jar"
|
||||||
|
property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
|
||||||
|
|
||||||
|
<path id="classpath">
|
||||||
|
<pathelement path="${analyzers-common.jar}"/>
|
||||||
|
<path refid="base.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="test.classpath">
|
||||||
|
<pathelement path="${analyzers-common.jar}"/>
|
||||||
|
<path refid="classpath"/>
|
||||||
|
<pathelement location="../../../lucene/build/classes/test/"/>
|
||||||
|
<pathelement location="../build/common/classes/test/"/>
|
||||||
|
<path refid="junit-path"/>
|
||||||
|
<pathelement location="${build.dir}/classes/java"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<target name="compile-core" depends="build-analyzers-common, common.compile-core" />
|
||||||
|
|
||||||
|
<target name="build-analyzers-common" unless="analyzers-common.uptodate">
|
||||||
|
<echo>phonetic building dependency ${analyzers-common.jar}</echo>
|
||||||
|
<ant antfile="../common/build.xml" target="default" inheritall="false" dir="../common" />
|
||||||
|
</target>
|
||||||
|
</project>
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,46 @@
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||||
|
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
or more contributor license agreements. See the NOTICE file
|
||||||
|
distributed with this work for additional information
|
||||||
|
regarding copyright ownership. The ASF licenses this file
|
||||||
|
to you under the Apache License, Version 2.0 (the
|
||||||
|
"License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing,
|
||||||
|
software distributed under the License is distributed on an
|
||||||
|
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
KIND, either express or implied. See the License for the
|
||||||
|
specific language governing permissions and limitations
|
||||||
|
under the License.
|
||||||
|
-->
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<parent>
|
||||||
|
<groupId>org.apache.lucene</groupId>
|
||||||
|
<artifactId>lucene-contrib</artifactId>
|
||||||
|
<version>@version@</version>
|
||||||
|
</parent>
|
||||||
|
<groupId>org.apache.lucene</groupId>
|
||||||
|
<artifactId>lucene-phonetic</artifactId>
|
||||||
|
<name>
|
||||||
|
Lucene Phonetic Filters
|
||||||
|
</name>
|
||||||
|
<version>@version@</version>
|
||||||
|
<description>
|
||||||
|
Provides phonetic encoding via Commons Codec.
|
||||||
|
</description>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
  <!-- Apache Commons Codec is published to Maven Central as commons-codec:commons-codec -->
  <groupId>commons-codec</groupId>
  <artifactId>commons-codec</artifactId>
  <version>${codec-version}</version>
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</project>
|
|
@ -14,7 +14,7 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.analysis;
|
package org.apache.lucene.analysis.phonetic;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
@ -35,7 +35,7 @@ public final class DoubleMetaphoneFilter extends TokenFilter {
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
|
public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
|
||||||
super(input);
|
super(input);
|
||||||
this.encoder.setMaxCodeLen(maxCodeLength);
|
this.encoder.setMaxCodeLen(maxCodeLength);
|
||||||
this.inject = inject;
|
this.inject = inject;
|
|
@ -15,7 +15,7 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
package org.apache.lucene.analysis.phonetic;
|
||||||
|
|
||||||
import org.apache.commons.codec.Encoder;
|
import org.apache.commons.codec.Encoder;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
@ -28,23 +28,19 @@ import java.io.IOException;
|
||||||
/**
|
/**
|
||||||
* Create tokens for phonetic matches. See:
|
* Create tokens for phonetic matches. See:
|
||||||
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
|
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
*/
|
*/
|
||||||
public final class PhoneticFilter extends TokenFilter
|
public final class PhoneticFilter extends TokenFilter
|
||||||
{
|
{
|
||||||
protected boolean inject = true;
|
protected boolean inject = true;
|
||||||
protected Encoder encoder = null;
|
protected Encoder encoder = null;
|
||||||
protected String name = null;
|
|
||||||
|
|
||||||
protected State save = null;
|
protected State save = null;
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
|
public PhoneticFilter(TokenStream in, Encoder encoder, boolean inject) {
|
||||||
super(in);
|
super(in);
|
||||||
this.encoder = encoder;
|
this.encoder = encoder;
|
||||||
this.name = name;
|
|
||||||
this.inject = inject;
|
this.inject = inject;
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,52 +14,53 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.analysis;
|
package org.apache.lucene.analysis.phonetic;
|
||||||
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
|
public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testSize4FalseInject() throws Exception {
|
public void testSize4FalseInject() throws Exception {
|
||||||
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
|
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
|
||||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
||||||
assertTokenStreamContents(filter, new String[] { "ANTR" });
|
assertTokenStreamContents(filter, new String[] { "ANTR" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSize4TrueInject() throws Exception {
|
public void testSize4TrueInject() throws Exception {
|
||||||
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
|
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
|
||||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
|
||||||
assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
|
assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testAlternateInjectFalse() throws Exception {
|
public void testAlternateInjectFalse() throws Exception {
|
||||||
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski"));
|
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Kuczewski"));
|
||||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
||||||
assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
|
assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSize8FalseInject() throws Exception {
|
public void testSize8FalseInject() throws Exception {
|
||||||
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
|
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
|
||||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||||
assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
|
assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testNonConvertableStringsWithInject() throws Exception {
|
public void testNonConvertableStringsWithInject() throws Exception {
|
||||||
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
|
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
|
||||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
|
||||||
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
|
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testNonConvertableStringsWithoutInject() throws Exception {
|
public void testNonConvertableStringsWithoutInject() throws Exception {
|
||||||
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
|
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
|
||||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||||
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
|
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
|
||||||
|
|
||||||
// should have something after the stream
|
// should have something after the stream
|
||||||
stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello"));
|
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%& hello"));
|
||||||
filter = new DoubleMetaphoneFilter(stream, 8, false);
|
filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||||
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
|
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
|
||||||
}
|
}
|
|
@ -0,0 +1,73 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.phonetic;
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.Encoder;
|
||||||
|
import org.apache.commons.codec.language.Caverphone;
|
||||||
|
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||||
|
import org.apache.commons.codec.language.Metaphone;
|
||||||
|
import org.apache.commons.codec.language.RefinedSoundex;
|
||||||
|
import org.apache.commons.codec.language.Soundex;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests {@link PhoneticFilter}
|
||||||
|
*/
|
||||||
|
public class TestPhoneticFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testAlgorithms() throws Exception {
|
||||||
|
assertAlgorithm(new Metaphone(), true, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" });
|
||||||
|
assertAlgorithm(new Metaphone(), false, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "A", "B", "KKK", "ESKS" });
|
||||||
|
|
||||||
|
assertAlgorithm(new DoubleMetaphone(), true, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" });
|
||||||
|
assertAlgorithm(new DoubleMetaphone(), false, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "A", "PP", "KK", "ASKS" });
|
||||||
|
|
||||||
|
assertAlgorithm(new Soundex(), true, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" });
|
||||||
|
assertAlgorithm(new Soundex(), false, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "A000", "B000", "C000", "E220" });
|
||||||
|
|
||||||
|
assertAlgorithm(new RefinedSoundex(), true, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" });
|
||||||
|
assertAlgorithm(new RefinedSoundex(), false, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "A0", "B1", "C3", "E034034" });
|
||||||
|
|
||||||
|
assertAlgorithm(new Caverphone(), true, "Darda Karleen Datha Carlene",
|
||||||
|
new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen",
|
||||||
|
"TTA1111111", "Datha", "KLN1111111", "Carlene" });
|
||||||
|
assertAlgorithm(new Caverphone(), false, "Darda Karleen Datha Carlene",
|
||||||
|
new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" });
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void assertAlgorithm(Encoder encoder, boolean inject, String input,
|
||||||
|
String[] expected) throws Exception {
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
|
||||||
|
new StringReader(input));
|
||||||
|
PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
|
||||||
|
assertTokenStreamContents(filter, expected);
|
||||||
|
}
|
||||||
|
}
|
|
@ -147,6 +147,7 @@
|
||||||
<path id="lucene.classpath">
|
<path id="lucene.classpath">
|
||||||
<pathelement location="${common-solr.dir}/../lucene/build/classes/java" />
|
<pathelement location="${common-solr.dir}/../lucene/build/classes/java" />
|
||||||
<pathelement location="${common-solr.dir}/../modules/analysis/build/common/classes/java" />
|
<pathelement location="${common-solr.dir}/../modules/analysis/build/common/classes/java" />
|
||||||
|
<pathelement location="${common-solr.dir}/../modules/analysis/build/phonetic/classes/java" />
|
||||||
<pathelement location="${common-solr.dir}/../lucene/build/contrib/highlighter/classes/java" />
|
<pathelement location="${common-solr.dir}/../lucene/build/contrib/highlighter/classes/java" />
|
||||||
<pathelement location="${common-solr.dir}/../lucene/build/contrib/memory/classes/java" />
|
<pathelement location="${common-solr.dir}/../lucene/build/contrib/memory/classes/java" />
|
||||||
<pathelement location="${common-solr.dir}/../lucene/build/contrib/misc/classes/java" />
|
<pathelement location="${common-solr.dir}/../lucene/build/contrib/misc/classes/java" />
|
||||||
|
@ -162,6 +163,7 @@
|
||||||
</subant>
|
</subant>
|
||||||
<subant target="jar" inheritall="false" failonerror="true">
|
<subant target="jar" inheritall="false" failonerror="true">
|
||||||
<fileset dir="../modules/analysis/common" includes="build.xml" />
|
<fileset dir="../modules/analysis/common" includes="build.xml" />
|
||||||
|
<fileset dir="../modules/analysis/phonetic" includes="build.xml" />
|
||||||
<fileset dir="../lucene/contrib/highlighter" includes="build.xml" />
|
<fileset dir="../lucene/contrib/highlighter" includes="build.xml" />
|
||||||
<fileset dir="../lucene/contrib/memory" includes="build.xml" />
|
<fileset dir="../lucene/contrib/memory" includes="build.xml" />
|
||||||
<fileset dir="../lucene/contrib/misc" includes="build.xml" />
|
<fileset dir="../lucene/contrib/misc" includes="build.xml" />
|
||||||
|
@ -181,6 +183,9 @@
|
||||||
<fileset dir="../modules/analysis/build/common">
|
<fileset dir="../modules/analysis/build/common">
|
||||||
<include name="lucene-analyzers-common-${version}.jar" />
|
<include name="lucene-analyzers-common-${version}.jar" />
|
||||||
</fileset>
|
</fileset>
|
||||||
|
<fileset dir="../modules/analysis/build/phonetic">
|
||||||
|
<include name="lucene-analyzers-phonetic-${version}.jar" />
|
||||||
|
</fileset>
|
||||||
<fileset dir="../lucene/build/contrib/highlighter">
|
<fileset dir="../lucene/build/contrib/highlighter">
|
||||||
<include name="lucene-highlighter-${version}.jar" />
|
<include name="lucene-highlighter-${version}.jar" />
|
||||||
</fileset>
|
</fileset>
|
||||||
|
@ -206,6 +211,7 @@
|
||||||
<property name="lucene-compiled" value="true"/>
|
<property name="lucene-compiled" value="true"/>
|
||||||
<subant target="default">
|
<subant target="default">
|
||||||
<fileset dir="../modules/analysis/common" includes="build.xml"/>
|
<fileset dir="../modules/analysis/common" includes="build.xml"/>
|
||||||
|
<fileset dir="../modules/analysis/phonetic" includes="build.xml"/>
|
||||||
<fileset dir="../lucene/contrib/highlighter" includes="build.xml"/>
|
<fileset dir="../lucene/contrib/highlighter" includes="build.xml"/>
|
||||||
<fileset dir="../lucene/contrib/memory" includes="build.xml"/>
|
<fileset dir="../lucene/contrib/memory" includes="build.xml"/>
|
||||||
<fileset dir="../lucene/contrib/misc" includes="build.xml"/>
|
<fileset dir="../lucene/contrib/misc" includes="build.xml"/>
|
||||||
|
|
|
@ -17,11 +17,10 @@
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.*;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
@ -29,11 +28,7 @@ import java.util.Map;
|
||||||
import java.util.StringTokenizer;
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A filter to apply normal capitalization rules to Tokens. It will make the first letter
|
* Factory for {@link CapitalizationFilter}.
|
||||||
* capital and the rest lower case.
|
|
||||||
* <p/>
|
|
||||||
* This filter is particularly useful to build nice looking facet parameters. This filter
|
|
||||||
* is not appropriate if you intend to use a prefix query.
|
|
||||||
* <p/>
|
* <p/>
|
||||||
* The factory takes parameters:<br/>
|
* The factory takes parameters:<br/>
|
||||||
* "onlyFirstWord" - should only the first word be capitalized, or all of the words?<br/>
|
* "onlyFirstWord" - should only the first word be capitalized, or all of the words?<br/>
|
||||||
|
@ -52,7 +47,6 @@ import java.util.StringTokenizer;
|
||||||
* @since solr 1.3
|
* @since solr 1.3
|
||||||
*/
|
*/
|
||||||
public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
|
public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
|
||||||
public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
|
|
||||||
public static final String KEEP = "keep";
|
public static final String KEEP = "keep";
|
||||||
public static final String KEEP_IGNORE_CASE = "keepIgnoreCase";
|
public static final String KEEP_IGNORE_CASE = "keepIgnoreCase";
|
||||||
public static final String OK_PREFIX = "okPrefix";
|
public static final String OK_PREFIX = "okPrefix";
|
||||||
|
@ -68,8 +62,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
|
||||||
Collection<char[]> okPrefix = Collections.emptyList(); // for Example: McK
|
Collection<char[]> okPrefix = Collections.emptyList(); // for Example: McK
|
||||||
|
|
||||||
int minWordLength = 0; // don't modify capitalization for words shorter than this
|
int minWordLength = 0; // don't modify capitalization for words shorter than this
|
||||||
int maxWordCount = DEFAULT_MAX_WORD_COUNT;
|
int maxWordCount = CapitalizationFilter.DEFAULT_MAX_WORD_COUNT;
|
||||||
int maxTokenLength = DEFAULT_MAX_WORD_COUNT;
|
int maxTokenLength = CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH;
|
||||||
boolean onlyFirstWord = true;
|
boolean onlyFirstWord = true;
|
||||||
boolean forceFirstLetter = true; // make sure the first letter is capital even if it is in the keep list
|
boolean forceFirstLetter = true; // make sure the first letter is capital even if it is in the keep list
|
||||||
|
|
||||||
|
@ -128,116 +122,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void processWord(char[] buffer, int offset, int length, int wordCount) {
|
|
||||||
if (length < 1) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (onlyFirstWord && wordCount > 0) {
|
|
||||||
for (int i = 0; i < length; i++) {
|
|
||||||
buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
|
|
||||||
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (keep != null && keep.contains(buffer, offset, length)) {
|
|
||||||
if (wordCount == 0 && forceFirstLetter) {
|
|
||||||
buffer[offset] = Character.toUpperCase(buffer[offset]);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (length < minWordLength) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (char[] prefix : okPrefix) {
|
|
||||||
if (length >= prefix.length) { //don't bother checking if the buffer length is less than the prefix
|
|
||||||
boolean match = true;
|
|
||||||
for (int i = 0; i < prefix.length; i++) {
|
|
||||||
if (prefix[i] != buffer[offset + i]) {
|
|
||||||
match = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (match == true) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// We know it has at least one character
|
|
||||||
/*char[] chars = w.toCharArray();
|
|
||||||
StringBuilder word = new StringBuilder( w.length() );
|
|
||||||
word.append( Character.toUpperCase( chars[0] ) );*/
|
|
||||||
buffer[offset] = Character.toUpperCase(buffer[offset]);
|
|
||||||
|
|
||||||
for (int i = 1; i < length; i++) {
|
|
||||||
buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
|
|
||||||
}
|
|
||||||
//return word.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
public CapitalizationFilter create(TokenStream input) {
|
public CapitalizationFilter create(TokenStream input) {
|
||||||
return new CapitalizationFilter(input, this);
|
return new CapitalizationFilter(input, onlyFirstWord, keep,
|
||||||
|
forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This relies on the Factory so that the difficult stuff does not need to be
|
|
||||||
* re-initialized each time the filter runs.
|
|
||||||
* <p/>
|
|
||||||
* This is package protected since it is not useful without the Factory
|
|
||||||
*/
|
|
||||||
final class CapitalizationFilter extends TokenFilter {
|
|
||||||
private final CapitalizationFilterFactory factory;
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
|
||||||
|
|
||||||
public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
|
|
||||||
super(in);
|
|
||||||
this.factory = factory;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean incrementToken() throws IOException {
|
|
||||||
if (!input.incrementToken()) return false;
|
|
||||||
|
|
||||||
char[] termBuffer = termAtt.buffer();
|
|
||||||
int termBufferLength = termAtt.length();
|
|
||||||
char[] backup = null;
|
|
||||||
if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
|
|
||||||
//make a backup in case we exceed the word count
|
|
||||||
backup = new char[termBufferLength];
|
|
||||||
System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
|
|
||||||
}
|
|
||||||
if (termBufferLength < factory.maxTokenLength) {
|
|
||||||
int wordCount = 0;
|
|
||||||
|
|
||||||
int lastWordStart = 0;
|
|
||||||
for (int i = 0; i < termBufferLength; i++) {
|
|
||||||
char c = termBuffer[i];
|
|
||||||
if (c <= ' ' || c == '.') {
|
|
||||||
int len = i - lastWordStart;
|
|
||||||
if (len > 0) {
|
|
||||||
factory.processWord(termBuffer, lastWordStart, len, wordCount++);
|
|
||||||
lastWordStart = i + 1;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// process the last word
|
|
||||||
if (lastWordStart < termBufferLength) {
|
|
||||||
factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (wordCount > factory.maxWordCount) {
|
|
||||||
termAtt.copyBuffer(backup, 0, termBufferLength);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.solr.analysis;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
|
||||||
|
|
||||||
public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
|
public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
|
||||||
{
|
{
|
||||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.commons.codec.language.Metaphone;
|
||||||
import org.apache.commons.codec.language.RefinedSoundex;
|
import org.apache.commons.codec.language.RefinedSoundex;
|
||||||
import org.apache.commons.codec.language.Soundex;
|
import org.apache.commons.codec.language.Soundex;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.common.util.StrUtils;
|
import org.apache.solr.common.util.StrUtils;
|
||||||
|
|
||||||
|
@ -96,6 +97,6 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
|
||||||
}
|
}
|
||||||
|
|
||||||
public PhoneticFilter create(TokenStream input) {
|
public PhoneticFilter create(TokenStream input) {
|
||||||
return new PhoneticFilter(input,encoder,name,inject);
|
return new PhoneticFilter(input,encoder,inject);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
|
||||||
public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
|
public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
|
||||||
|
|
|
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class TestCapitalizationFilter extends BaseTokenTestCase {
|
public class TestCapitalizationFilterFactory extends BaseTokenTestCase {
|
||||||
|
|
||||||
public void testCapitalization() throws Exception
|
public void testCapitalization() throws Exception
|
||||||
{
|
{
|
||||||
|
@ -40,74 +40,78 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
|
||||||
|
|
||||||
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
|
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
|
||||||
factory.init( args );
|
factory.init( args );
|
||||||
char[] termBuffer;
|
assertTokenStreamContents(factory.create(
|
||||||
termBuffer = "kiTTEN".toCharArray();
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kiTTEN"))),
|
||||||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
new String[] { "Kitten" });
|
||||||
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
|
|
||||||
|
|
||||||
factory.forceFirstLetter = true;
|
factory.forceFirstLetter = true;
|
||||||
|
|
||||||
termBuffer = "and".toCharArray();
|
assertTokenStreamContents(factory.create(
|
||||||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("and"))),
|
||||||
assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced
|
new String[] { "And" });
|
||||||
|
|
||||||
termBuffer = "AnD".toCharArray();
|
//first is forced, but it's not a keep word, either
|
||||||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
assertTokenStreamContents(factory.create(
|
||||||
assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced, but it's not a keep word, either
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
|
||||||
|
new String[] { "And" });
|
||||||
|
|
||||||
factory.forceFirstLetter = false;
|
factory.forceFirstLetter = false;
|
||||||
termBuffer = "AnD".toCharArray();
|
|
||||||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
//first is not forced, but it's not a keep word, either
|
||||||
assertEquals( "And", new String(termBuffer, 0, termBuffer.length)); //first is not forced, but it's not a keep word, either
|
assertTokenStreamContents(factory.create(
|
||||||
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
|
||||||
|
new String[] { "And" });
|
||||||
|
|
||||||
factory.forceFirstLetter = true;
|
factory.forceFirstLetter = true;
|
||||||
termBuffer = "big".toCharArray();
|
|
||||||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
|
||||||
assertEquals( "Big", new String(termBuffer, 0, termBuffer.length));
|
|
||||||
termBuffer = "BIG".toCharArray();
|
|
||||||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
|
||||||
assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length));
|
|
||||||
|
|
||||||
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
|
assertTokenStreamContents(factory.create(
|
||||||
TokenStream stream = factory.create(tokenizer);
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("big"))),
|
||||||
assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });
|
new String[] { "Big" });
|
||||||
|
|
||||||
|
assertTokenStreamContents(factory.create(
|
||||||
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("BIG"))),
|
||||||
|
new String[] { "BIG" });
|
||||||
|
|
||||||
|
assertTokenStreamContents(factory.create(
|
||||||
|
new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"))),
|
||||||
|
new String[] { "Hello there my name is ryan" });
|
||||||
|
|
||||||
// now each token
|
// now each token
|
||||||
factory.onlyFirstWord = false;
|
factory.onlyFirstWord = false;
|
||||||
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
|
assertTokenStreamContents(factory.create(
|
||||||
stream = factory.create(tokenizer);
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
|
||||||
assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
|
new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
|
||||||
|
|
||||||
// now only the long words
|
// now only the long words
|
||||||
factory.minWordLength = 3;
|
factory.minWordLength = 3;
|
||||||
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
|
assertTokenStreamContents(factory.create(
|
||||||
stream = factory.create(tokenizer);
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
|
||||||
assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
|
new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
|
||||||
|
|
||||||
// without prefix
|
// without prefix
|
||||||
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
|
assertTokenStreamContents(factory.create(
|
||||||
stream = factory.create(tokenizer);
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
|
||||||
assertTokenStreamContents(stream, new String[] { "Mckinley" });
|
new String[] { "Mckinley" });
|
||||||
|
|
||||||
// Now try some prefixes
|
// Now try some prefixes
|
||||||
factory = new CapitalizationFilterFactory();
|
factory = new CapitalizationFilterFactory();
|
||||||
args.put( "okPrefix", "McK" ); // all words
|
args.put( "okPrefix", "McK" ); // all words
|
||||||
factory.init( args );
|
factory.init( args );
|
||||||
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
|
assertTokenStreamContents(factory.create(
|
||||||
stream = factory.create(tokenizer);
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
|
||||||
assertTokenStreamContents(stream, new String[] { "McKinley" });
|
new String[] { "McKinley" });
|
||||||
|
|
||||||
// now try some stuff with numbers
|
// now try some stuff with numbers
|
||||||
factory.forceFirstLetter = false;
|
factory.forceFirstLetter = false;
|
||||||
factory.onlyFirstWord = false;
|
factory.onlyFirstWord = false;
|
||||||
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
|
assertTokenStreamContents(factory.create(
|
||||||
stream = factory.create(tokenizer);
|
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third"))),
|
||||||
assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
|
new String[] { "1st", "2nd", "Third" });
|
||||||
|
|
||||||
factory.forceFirstLetter = true;
|
factory.forceFirstLetter = true;
|
||||||
tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
|
assertTokenStreamContents(factory.create(
|
||||||
stream = factory.create(tokenizer);
|
new KeywordTokenizer(new StringReader("the The the"))),
|
||||||
assertTokenStreamContents(stream, new String[] { "The The the" });
|
new String[] { "The The the" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testKeepIgnoreCase() throws Exception {
|
public void testKeepIgnoreCase() throws Exception {
|
||||||
|
@ -118,21 +122,20 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
|
||||||
|
|
||||||
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
|
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
|
||||||
factory.init( args );
|
factory.init( args );
|
||||||
char[] termBuffer;
|
|
||||||
termBuffer = "kiTTEN".toCharArray();
|
|
||||||
factory.forceFirstLetter = true;
|
factory.forceFirstLetter = true;
|
||||||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
assertTokenStreamContents(factory.create(
|
||||||
assertEquals( "KiTTEN", new String(termBuffer, 0, termBuffer.length));
|
new KeywordTokenizer(new StringReader("kiTTEN"))),
|
||||||
|
new String[] { "KiTTEN" });
|
||||||
|
|
||||||
factory.forceFirstLetter = false;
|
factory.forceFirstLetter = false;
|
||||||
termBuffer = "kiTTEN".toCharArray();
|
assertTokenStreamContents(factory.create(
|
||||||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
new KeywordTokenizer(new StringReader("kiTTEN"))),
|
||||||
assertEquals( "kiTTEN", new String(termBuffer, 0, termBuffer.length));
|
new String[] { "kiTTEN" });
|
||||||
|
|
||||||
factory.keep = null;
|
factory.keep = null;
|
||||||
termBuffer = "kiTTEN".toCharArray();
|
assertTokenStreamContents(factory.create(
|
||||||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
new KeywordTokenizer(new StringReader("kiTTEN"))),
|
||||||
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
|
new String[] { "Kitten" });
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
|
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
/**
|
/**
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
public class TestPhoneticFilter extends BaseTokenTestCase {
|
public class TestPhoneticFilterFactory extends BaseTokenTestCase {
|
||||||
|
|
||||||
public void testFactory()
|
public void testFactory()
|
||||||
{
|
{
|
Loading…
Reference in New Issue