mirror of https://github.com/apache/lucene.git

LUCENE-2413: consolidate remaining solr tokenstreams into modules/analysis

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@957162 13f79535-47bb-0310-9956-ffa450edef68

parent 653c7c160b
commit 8f71031ac8
@@ -27,11 +27,14 @@ New Features
    with text contained in the required words (inverse of StopFilter).
  - o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that puts
    hyphenated words broken into two lines back together.
  - o.a.l.analysis.miscellaneous.CapitalizationFilter: A TokenFilter that applies
    capitalization rules to tokens.
  - o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
    CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
  - o.a.l.analysis.synonym.SynonymFilter: A synonym filter that supports multi-word
    synonyms.
    (... in progress)
  - o.a.l.analysis.phonetic: Package for phonetic search, containing various
    phonetic encoders such as Double Metaphone.

* LUCENE-2413: Consolidated all Lucene analyzers into common.
  - o.a.l.analysis.KeywordAnalyzer -> o.a.l.analysis.core.KeywordAnalyzer

@@ -60,7 +63,6 @@ New Features
  - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
  - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
  - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
  ... (in progress)

Build
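For orientation on the package moves listed above, here is a minimal sketch of the consolidated layout in use. The demo class, the input string, and the Version constant are illustrative assumptions, not part of this commit:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ConsolidatedPackagesDemo {
  public static void main(String[] args) throws Exception {
    // The tokenizer now lives in o.a.l.analysis.core and the filter in
    // o.a.l.analysis.miscellaneous, per the moves listed above.
    TokenStream ts = new CapitalizationFilter(
        new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("hello thEre")));
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term); // prints "Hello", then "There"
    }
    ts.close();
  }
}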
@@ -4,6 +4,10 @@ Copyright 2006 The Apache Software Foundation
This product includes software developed by
The Apache Software Foundation (http://www.apache.org/).

+ Includes software from other Apache Software Foundation projects,
+ including, but not limited to:
+  - Apache Commons
+
The snowball stemmers in
  common/src/java/net/sf/snowball
were developed by Martin Porter and Richard Boulton.
@@ -20,7 +20,12 @@ lucene-analyzers-common-XX.jar
lucene-analyzers-icu-XX.jar
  An add-on analysis library that provides improved Unicode support via
  International Components for Unicode (ICU). Note: this module depends on
- the ICU4j jar file (version > 4.4.0)
+ the ICU4j jar file (version >= 4.4.0)

+ lucene-analyzers-phonetic-XX.jar
+   An add-on analysis library that provides phonetic encoders via Apache
+   Commons-Codec. Note: this module depends on the commons-codec jar
+   file (version >= 1.4)
+
lucene-analyzers-smartcn-XX.jar
  An add-on analysis library that provides word segmentation for Simplified

@@ -32,12 +37,14 @@ lucene-analyzers-stempel-XX.jar

common/src/java
icu/src/java
+ phonetic/src/java
smartcn/src/java
stempel/src/java
- The source code for the four libraries.
+ The source code for the five libraries.

common/src/test
icu/src/test
+ phonetic/src/test
smartcn/src/test
stempel/src/test
- Unit tests for the four libraries.
+ Unit tests for the five libraries.
@@ -35,6 +35,10 @@
    <ant dir="icu" />
  </target>

+  <target name="phonetic">
+    <ant dir="phonetic" />
+  </target>
+
  <target name="smartcn">
    <ant dir="smartcn" />
  </target>

@@ -44,29 +48,33 @@
  </target>

  <target name="default" depends="compile"/>
-  <target name="compile" depends="common,icu,smartcn,stempel" />
+  <target name="compile" depends="common,icu,phonetic,smartcn,stempel" />

  <target name="clean">
    <ant dir="common" target="clean" />
    <ant dir="icu" target="clean" />
+    <ant dir="phonetic" target="clean" />
    <ant dir="smartcn" target="clean" />
    <ant dir="stempel" target="clean" />
  </target>
  <target name="compile-core">
    <ant dir="common" target="compile-core" />
    <ant dir="icu" target="compile-core" />
+    <ant dir="phonetic" target="compile-core" />
    <ant dir="smartcn" target="compile-core" />
    <ant dir="stempel" target="compile-core" />
  </target>
  <target name="compile-test">
    <ant dir="common" target="compile-test" />
    <ant dir="icu" target="compile-test" />
+    <ant dir="phonetic" target="compile-test" />
    <ant dir="smartcn" target="compile-test" />
    <ant dir="stempel" target="compile-test" />
  </target>
  <target name="test">
    <ant dir="common" target="test" />
    <ant dir="icu" target="test" />
+    <ant dir="phonetic" target="test" />
    <ant dir="smartcn" target="test" />
    <ant dir="stempel" target="test" />
  </target>

@@ -76,6 +84,7 @@
  <target name="dist-maven" depends="default">
    <ant dir="common" target="dist-maven" />
    <ant dir="icu" target="dist-maven" />
+    <ant dir="phonetic" target="dist-maven" />
    <ant dir="smartcn" target="dist-maven" />
    <ant dir="stempel" target="dist-maven" />
  </target>

@@ -83,6 +92,7 @@
  <target name="javadocs">
    <ant dir="common" target="javadocs" />
    <ant dir="icu" target="javadocs" />
+    <ant dir="phonetic" target="javadocs" />
    <ant dir="smartcn" target="javadocs" />
    <ant dir="stempel" target="javadocs" />
  </target>

@@ -90,6 +100,7 @@
  <target name="javadocs-index.html">
    <ant dir="common" target="javadocs-index.html" />
    <ant dir="icu" target="javadocs-index.html" />
+    <ant dir="phonetic" target="javadocs-index.html" />
    <ant dir="smartcn" target="javadocs-index.html" />
    <ant dir="stempel" target="javadocs-index.html" />
  </target>
@@ -0,0 +1,181 @@
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collection;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * A filter to apply normal capitalization rules to Tokens.  It will make the first letter
 * capital and the rest lower case.
 * <p/>
 * This filter is particularly useful to build nice looking facet parameters.  This filter
 * is not appropriate if you intend to use a prefix query.
 */
public final class CapitalizationFilter extends TokenFilter {
  public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
  public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;

  private final boolean onlyFirstWord;
  private final CharArraySet keep;
  private final boolean forceFirstLetter;
  private final Collection<char[]> okPrefix;

  private final int minWordLength;
  private final int maxWordCount;
  private final int maxTokenLength;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Creates a CapitalizationFilter with the default parameters.
   * <p>
   * Calls {@link #CapitalizationFilter(TokenStream, boolean, CharArraySet, boolean, Collection, int, int, int)
   *        CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH)}
   */
  public CapitalizationFilter(TokenStream in) {
    this(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  }

  /**
   * Creates a CapitalizationFilter with the specified parameters.
   * @param in input tokenstream
   * @param onlyFirstWord should each word be capitalized or all of the words?
   * @param keep a keep word list. Words in this set are passed through unchanged
   *        (aside from forceFirstLetter).
   * @param forceFirstLetter Force the first letter to be capitalized even if it is in the keep list.
   * @param okPrefix do not change word capitalization if a word begins with something in this list.
   * @param minWordLength how long the word needs to be to get capitalization applied.  If the
   *        minWordLength is 3, "and" > "And" but "or" stays "or".
   * @param maxWordCount if the token contains more than maxWordCount words, the capitalization is
   *        assumed to be correct.
   * @param maxTokenLength tokens at or above this length bypass capitalization entirely.
   */
  public CapitalizationFilter(TokenStream in, boolean onlyFirstWord, CharArraySet keep,
      boolean forceFirstLetter, Collection<char[]> okPrefix, int minWordLength,
      int maxWordCount, int maxTokenLength) {
    super(in);
    this.onlyFirstWord = onlyFirstWord;
    this.keep = keep;
    this.forceFirstLetter = forceFirstLetter;
    this.okPrefix = okPrefix;
    this.minWordLength = minWordLength;
    this.maxWordCount = maxWordCount;
    this.maxTokenLength = maxTokenLength;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;

    char[] termBuffer = termAtt.buffer();
    int termBufferLength = termAtt.length();
    char[] backup = null;

    if (maxWordCount < DEFAULT_MAX_WORD_COUNT) {
      //make a backup in case we exceed the word count
      backup = new char[termBufferLength];
      System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
    }

    if (termBufferLength < maxTokenLength) {
      int wordCount = 0;

      int lastWordStart = 0;
      for (int i = 0; i < termBufferLength; i++) {
        char c = termBuffer[i];
        if (c <= ' ' || c == '.') {
          int len = i - lastWordStart;
          if (len > 0) {
            processWord(termBuffer, lastWordStart, len, wordCount++);
            lastWordStart = i + 1;
            i++;
          }
        }
      }

      // process the last word
      if (lastWordStart < termBufferLength) {
        processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
      }

      if (wordCount > maxWordCount) {
        termAtt.copyBuffer(backup, 0, termBufferLength);
      }
    }

    return true;
  }

  private void processWord(char[] buffer, int offset, int length, int wordCount) {
    if (length < 1) {
      return;
    }

    if (onlyFirstWord && wordCount > 0) {
      for (int i = 0; i < length; i++) {
        buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
      }
      return;
    }

    if (keep != null && keep.contains(buffer, offset, length)) {
      if (wordCount == 0 && forceFirstLetter) {
        buffer[offset] = Character.toUpperCase(buffer[offset]);
      }
      return;
    }

    if (length < minWordLength) {
      return;
    }

    if (okPrefix != null) {
      for (char[] prefix : okPrefix) {
        if (length >= prefix.length) { //don't bother checking if the buffer length is less than the prefix
          boolean match = true;
          for (int i = 0; i < prefix.length; i++) {
            if (prefix[i] != buffer[offset + i]) {
              match = false;
              break;
            }
          }
          if (match) {
            return;
          }
        }
      }
    }

    // We know it has at least one character
    /*char[] chars = w.toCharArray();
    StringBuilder word = new StringBuilder( w.length() );
    word.append( Character.toUpperCase( chars[0] ) );*/
    buffer[offset] = Character.toUpperCase(buffer[offset]);

    for (int i = 1; i < length; i++) {
      buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
    }
    //return word.toString();
  }
}
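To make the full constructor above concrete, a small usage sketch; the keep set, input text, and Version constant are illustrative assumptions:

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class CapitalizationFilterUsage {
  public static void main(String[] args) throws Exception {
    // "the" and "AND" are keep words (ignoreCase = true), so they pass through unchanged.
    CharArraySet keep = new CharArraySet(Version.LUCENE_40, Arrays.asList("and", "the"), true);
    TokenStream ts = new CapitalizationFilter(
        new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("the end AND the means")),
        false,  // onlyFirstWord = false: process every word, not just the first
        keep,
        false,  // forceFirstLetter = false: keep words stay untouched even at word 0
        null,   // okPrefix: no prefix exemptions
        0,      // minWordLength: capitalize regardless of length
        CapitalizationFilter.DEFAULT_MAX_WORD_COUNT,
        CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term); // the, End, AND, the, Means
    }
    ts.close();
  }
}

The test class that follows exercises the same parameters exhaustively.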
@@ -0,0 +1,121 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*;

/** Tests {@link CapitalizationFilter} */
public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
  public void testCapitalization() throws Exception {
    CharArraySet keep = new CharArraySet(TEST_VERSION_CURRENT,
        Arrays.asList("and", "the", "it", "BIG"), false);

    assertCapitalizesTo("kiTTEN", new String[] { "Kitten" },
        true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    assertCapitalizesTo("and", new String[] { "And" },
        true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    assertCapitalizesTo("AnD", new String[] { "And" },
        true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    //first is not forced, but it's not a keep word, either
    assertCapitalizesTo("AnD", new String[] { "And" },
        true, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    assertCapitalizesTo("big", new String[] { "Big" },
        true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    assertCapitalizesTo("BIG", new String[] { "BIG" },
        true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    assertCapitalizesToKeyword("Hello thEre my Name is Ryan", "Hello there my name is ryan",
        true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    // now each token
    assertCapitalizesTo("Hello thEre my Name is Ryan",
        new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" },
        false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    // now only the long words
    assertCapitalizesTo("Hello thEre my Name is Ryan",
        new String[] { "Hello", "There", "my", "Name", "is", "Ryan" },
        false, keep, true, null, 3, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    // without prefix
    assertCapitalizesTo("McKinley",
        new String[] { "Mckinley" },
        true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    // Now try some prefixes
    List<char[]> okPrefix = new ArrayList<char[]>();
    okPrefix.add("McK".toCharArray());

    assertCapitalizesTo("McKinley",
        new String[] { "McKinley" },
        true, keep, true, okPrefix, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    // now try some stuff with numbers
    assertCapitalizesTo("1st 2nd third",
        new String[] { "1st", "2nd", "Third" },
        false, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    assertCapitalizesToKeyword("the The the", "The The the",
        false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  }

  static void assertCapitalizesTo(Tokenizer tokenizer, String expected[],
      boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
      Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
      int maxTokenLength) throws IOException {
    CapitalizationFilter filter = new CapitalizationFilter(tokenizer, onlyFirstWord, keep,
        forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
    assertTokenStreamContents(filter, expected);
  }

  static void assertCapitalizesTo(String input, String expected[],
      boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
      Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
      int maxTokenLength) throws IOException {
    assertCapitalizesTo(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
        expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength,
        maxWordCount, maxTokenLength);
  }

  static void assertCapitalizesToKeyword(String input, String expected,
      boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
      Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
      int maxTokenLength) throws IOException {
    assertCapitalizesTo(new KeywordTokenizer(new StringReader(input)),
        new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
        minWordLength, maxWordCount, maxTokenLength);
  }
}
@@ -0,0 +1,63 @@
<?xml version="1.0"?>

<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<project name="analyzers-phonetic" default="default">

  <description>
    Provides phonetic encoding support via Apache Commons Codec.
  </description>

  <property name="build.dir" location="../build/phonetic" />
  <property name="dist.dir" location="../dist/phonetic" />

  <path id="additional.dependencies">
    <fileset dir="lib" includes="commons-codec-*.jar"/>
  </path>

  <pathconvert property="project.classpath"
               targetos="unix"
               refid="additional.dependencies"
  />

  <import file="../../../lucene/contrib/contrib-build.xml"/>

  <module-uptodate name="analysis/common" jarfile="../build/common/lucene-analyzers-common-${version}.jar"
                   property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>

  <path id="classpath">
    <pathelement path="${analyzers-common.jar}"/>
    <path refid="base.classpath"/>
  </path>

  <path id="test.classpath">
    <pathelement path="${analyzers-common.jar}"/>
    <path refid="classpath"/>
    <pathelement location="../../../lucene/build/classes/test/"/>
    <pathelement location="../build/common/classes/test/"/>
    <path refid="junit-path"/>
    <pathelement location="${build.dir}/classes/java"/>
  </path>

  <target name="compile-core" depends="build-analyzers-common, common.compile-core" />

  <target name="build-analyzers-common" unless="analyzers-common.uptodate">
    <echo>phonetic building dependency ${analyzers-common.jar}</echo>
    <ant antfile="../common/build.xml" target="default" inheritall="false" dir="../common" />
  </target>
</project>
@@ -0,0 +1,2 @@
AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history.
Apache SVN contains full history.
@@ -0,0 +1,46 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

  <!--
    Licensed to the Apache Software Foundation (ASF) under one
    or more contributor license agreements.  See the NOTICE file
    distributed with this work for additional information
    regarding copyright ownership.  The ASF licenses this file
    to you under the Apache License, Version 2.0 (the
    "License"); you may not use this file except in compliance
    with the License.  You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing,
    software distributed under the License is distributed on an
    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    KIND, either express or implied.  See the License for the
    specific language governing permissions and limitations
    under the License.
  -->
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-contrib</artifactId>
    <version>@version@</version>
  </parent>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-phonetic</artifactId>
  <name>
    Lucene Phonetic Filters
  </name>
  <version>@version@</version>
  <description>
    Provides phonetic encoding via Commons Codec.
  </description>
  <packaging>jar</packaging>
  <dependencies>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>codec</artifactId>
      <version>${codec-version}</version>
    </dependency>
  </dependencies>
</project>
@@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.phonetic;

import java.io.IOException;
import java.util.LinkedList;

@@ -35,7 +35,7 @@ public final class DoubleMetaphoneFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);

-  protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
+  public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
    super(input);
    this.encoder.setMaxCodeLen(maxCodeLength);
    this.inject = inject;
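Since the constructor is now public, the filter can be constructed directly from outside its package. A minimal sketch; the demo class, input, and Version constant are assumptions, and the expected output is taken from the tests further down:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.util.Version;

public class DoubleMetaphoneDemo {
  public static void main(String[] args) throws Exception {
    // maxCodeLength = 4 caps each phonetic code at four characters;
    // inject = false replaces tokens with their codes instead of adding them.
    TokenStream ts = new DoubleMetaphoneFilter(
        new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("international")),
        4, false);
    // Per testSize4FalseInject below, this chain emits the single token "ANTR".
  }
}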
@@ -15,7 +15,7 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.phonetic;

import org.apache.commons.codec.Encoder;
import org.apache.lucene.analysis.TokenFilter;

@@ -28,23 +28,19 @@ import java.io.IOException;
/**
 * Create tokens for phonetic matches.  See:
 * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
- *
- * @version $Id$
 */
public final class PhoneticFilter extends TokenFilter
{
  protected boolean inject = true;
  protected Encoder encoder = null;
-  protected String name = null;

  protected State save = null;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);

-  public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
+  public PhoneticFilter(TokenStream in, Encoder encoder, boolean inject) {
    super(in);
    this.encoder = encoder;
-    this.name = name;
    this.inject = inject;
  }
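With the unused name parameter dropped, construction is simpler. A sketch of the new signature; the encoder choice, input, and Version constant are assumptions (any commons-codec Encoder works):

import java.io.StringReader;

import org.apache.commons.codec.language.Metaphone;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.apache.lucene.util.Version;

public class PhoneticFilterDemo {
  public static void main(String[] args) throws Exception {
    // inject = true emits the phonetic code alongside the original token at the
    // same position, so either form can match at search time.
    TokenStream ts = new PhoneticFilter(
        new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("aaa bbb")),
        new Metaphone(), true);
    // Per TestPhoneticFilter below, this yields "A", "aaa", "B", "bbb".
  }
}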
@@ -14,52 +14,53 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.phonetic;

import java.io.StringReader;

+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

-public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
+public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {

  public void testSize4FalseInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
    assertTokenStreamContents(filter, new String[] { "ANTR" });
  }

  public void testSize4TrueInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
    assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
  }

  public void testAlternateInjectFalse() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Kuczewski"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
    assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
  }

  public void testSize8FalseInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
  }

  public void testNonConvertableStringsWithInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
  }

  public void testNonConvertableStringsWithoutInject() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });

    // should have something after the stream
-    stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello"));
+    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%& hello"));
    filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
  }
@@ -0,0 +1,73 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.phonetic;

import java.io.StringReader;

import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.Caverphone;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

/**
 * Tests {@link PhoneticFilter}
 */
public class TestPhoneticFilter extends BaseTokenStreamTestCase {

  public void testAlgorithms() throws Exception {
    assertAlgorithm(new Metaphone(), true, "aaa bbb ccc easgasg",
        new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" });
    assertAlgorithm(new Metaphone(), false, "aaa bbb ccc easgasg",
        new String[] { "A", "B", "KKK", "ESKS" });

    assertAlgorithm(new DoubleMetaphone(), true, "aaa bbb ccc easgasg",
        new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" });
    assertAlgorithm(new DoubleMetaphone(), false, "aaa bbb ccc easgasg",
        new String[] { "A", "PP", "KK", "ASKS" });

    assertAlgorithm(new Soundex(), true, "aaa bbb ccc easgasg",
        new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" });
    assertAlgorithm(new Soundex(), false, "aaa bbb ccc easgasg",
        new String[] { "A000", "B000", "C000", "E220" });

    assertAlgorithm(new RefinedSoundex(), true, "aaa bbb ccc easgasg",
        new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" });
    assertAlgorithm(new RefinedSoundex(), false, "aaa bbb ccc easgasg",
        new String[] { "A0", "B1", "C3", "E034034" });

    assertAlgorithm(new Caverphone(), true, "Darda Karleen Datha Carlene",
        new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen",
          "TTA1111111", "Datha", "KLN1111111", "Carlene" });
    assertAlgorithm(new Caverphone(), false, "Darda Karleen Datha Carlene",
        new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" });
  }

  static void assertAlgorithm(Encoder encoder, boolean inject, String input,
      String[] expected) throws Exception {
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
        new StringReader(input));
    PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
    assertTokenStreamContents(filter, expected);
  }
}
@@ -147,6 +147,7 @@
  <path id="lucene.classpath">
    <pathelement location="${common-solr.dir}/../lucene/build/classes/java" />
    <pathelement location="${common-solr.dir}/../modules/analysis/build/common/classes/java" />
+    <pathelement location="${common-solr.dir}/../modules/analysis/build/phonetic/classes/java" />
    <pathelement location="${common-solr.dir}/../lucene/build/contrib/highlighter/classes/java" />
    <pathelement location="${common-solr.dir}/../lucene/build/contrib/memory/classes/java" />
    <pathelement location="${common-solr.dir}/../lucene/build/contrib/misc/classes/java" />

@@ -162,6 +163,7 @@
  </subant>
  <subant target="jar" inheritall="false" failonerror="true">
    <fileset dir="../modules/analysis/common" includes="build.xml" />
+    <fileset dir="../modules/analysis/phonetic" includes="build.xml" />
    <fileset dir="../lucene/contrib/highlighter" includes="build.xml" />
    <fileset dir="../lucene/contrib/memory" includes="build.xml" />
    <fileset dir="../lucene/contrib/misc" includes="build.xml" />

@@ -181,6 +183,9 @@
    <fileset dir="../modules/analysis/build/common">
      <include name="lucene-analyzers-common-${version}.jar" />
    </fileset>
+    <fileset dir="../modules/analysis/build/phonetic">
+      <include name="lucene-analyzers-phonetic-${version}.jar" />
+    </fileset>
    <fileset dir="../lucene/build/contrib/highlighter">
      <include name="lucene-highlighter-${version}.jar" />
    </fileset>

@@ -206,6 +211,7 @@
  <property name="lucene-compiled" value="true"/>
  <subant target="default">
    <fileset dir="../modules/analysis/common" includes="build.xml"/>
+    <fileset dir="../modules/analysis/phonetic" includes="build.xml"/>
    <fileset dir="../lucene/contrib/highlighter" includes="build.xml"/>
    <fileset dir="../lucene/contrib/memory" includes="build.xml"/>
    <fileset dir="../lucene/contrib/misc" includes="build.xml"/>
@@ -17,11 +17,10 @@

package org.apache.solr.analysis;

-import org.apache.lucene.analysis.*;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
+import org.apache.lucene.analysis.util.CharArraySet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@@ -29,11 +28,7 @@ import java.util.Map;
import java.util.StringTokenizer;

/**
- * A filter to apply normal capitalization rules to Tokens.  It will make the first letter
- * capital and the rest lower case.
- * <p/>
- * This filter is particularly useful to build nice looking facet parameters.  This filter
- * is not appropriate if you intend to use a prefix query.
+ * Factory for {@link CapitalizationFilter}.
 * <p/>
 * The factory takes parameters:<br/>
 * "onlyFirstWord" - should each word be capitalized or all of the words?<br/>
@@ -52,7 +47,6 @@ import java.util.StringTokenizer;
 * @since solr 1.3
 */
public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
-  public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
  public static final String KEEP = "keep";
  public static final String KEEP_IGNORE_CASE = "keepIgnoreCase";
  public static final String OK_PREFIX = "okPrefix";
@@ -68,8 +62,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
  Collection<char[]> okPrefix = Collections.emptyList(); // for Example: McK

  int minWordLength = 0; // don't modify capitalization for words shorter than this
-  int maxWordCount = DEFAULT_MAX_WORD_COUNT;
-  int maxTokenLength = DEFAULT_MAX_WORD_COUNT;
+  int maxWordCount = CapitalizationFilter.DEFAULT_MAX_WORD_COUNT;
+  int maxTokenLength = CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH;
  boolean onlyFirstWord = true;
  boolean forceFirstLetter = true; // make sure the first letter is capital even if it is in the keep list
@@ -128,116 +122,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
    }
  }

-  public void processWord(char[] buffer, int offset, int length, int wordCount) {
-    if (length < 1) {
-      return;
-    }
-    if (onlyFirstWord && wordCount > 0) {
-      for (int i = 0; i < length; i++) {
-        buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
-      }
-      return;
-    }
-
-    if (keep != null && keep.contains(buffer, offset, length)) {
-      if (wordCount == 0 && forceFirstLetter) {
-        buffer[offset] = Character.toUpperCase(buffer[offset]);
-      }
-      return;
-    }
-    if (length < minWordLength) {
-      return;
-    }
-    for (char[] prefix : okPrefix) {
-      if (length >= prefix.length) { //don't bother checking if the buffer length is less than the prefix
-        boolean match = true;
-        for (int i = 0; i < prefix.length; i++) {
-          if (prefix[i] != buffer[offset + i]) {
-            match = false;
-            break;
-          }
-        }
-        if (match == true) {
-          return;
-        }
-      }
-    }
-
-    // We know it has at least one character
-    /*char[] chars = w.toCharArray();
-    StringBuilder word = new StringBuilder( w.length() );
-    word.append( Character.toUpperCase( chars[0] ) );*/
-    buffer[offset] = Character.toUpperCase(buffer[offset]);
-
-    for (int i = 1; i < length; i++) {
-      buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
-    }
-    //return word.toString();
-  }
-
  public CapitalizationFilter create(TokenStream input) {
-    return new CapitalizationFilter(input, this);
+    return new CapitalizationFilter(input, onlyFirstWord, keep,
+        forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
  }
}
-
-
-/**
- * This relies on the Factory so that the difficult stuff does not need to be
- * re-initialized each time the filter runs.
- * <p/>
- * This is package protected since it is not useful without the Factory
- */
-final class CapitalizationFilter extends TokenFilter {
-  private final CapitalizationFilterFactory factory;
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-
-  public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
-    super(in);
-    this.factory = factory;
-  }
-
-  @Override
-  public boolean incrementToken() throws IOException {
-    if (!input.incrementToken()) return false;
-
-    char[] termBuffer = termAtt.buffer();
-    int termBufferLength = termAtt.length();
-    char[] backup = null;
-    if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
-      //make a backup in case we exceed the word count
-      backup = new char[termBufferLength];
-      System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
-    }
-    if (termBufferLength < factory.maxTokenLength) {
-      int wordCount = 0;
-
-      int lastWordStart = 0;
-      for (int i = 0; i < termBufferLength; i++) {
-        char c = termBuffer[i];
-        if (c <= ' ' || c == '.') {
-          int len = i - lastWordStart;
-          if (len > 0) {
-            factory.processWord(termBuffer, lastWordStart, len, wordCount++);
-            lastWordStart = i + 1;
-            i++;
-          }
-        }
-      }
-
-      // process the last word
-      if (lastWordStart < termBufferLength) {
-        factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
-      }
-
-      if (wordCount > factory.maxWordCount) {
-        termAtt.copyBuffer(backup, 0, termBufferLength);
-      }
-    }
-
-    return true;
-  }
-
-}
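After this refactor the Solr factory is a thin parameter parser over the shared Lucene filter. A usage sketch; the init(Map) lifecycle is taken from the tests further down, while the demo class, argument values, and Version constant are assumptions:

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.CapitalizationFilterFactory;

public class CapitalizationFactoryDemo {
  public static void main(String[] args) throws Exception {
    Map<String, String> params = new HashMap<String, String>();
    params.put("keep", "and the it BIG"); // whitespace-separated keep words
    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
    factory.init(params);
    // create() now delegates to the shared o.a.l.analysis.miscellaneous.CapitalizationFilter.
    TokenStream ts = factory.create(
        new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("kiTTEN")));
    // Expected single token: "Kitten" (see the factory tests below).
  }
}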
@@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;

public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
{
@@ -29,6 +29,7 @@ import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;

@@ -96,6 +97,6 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
  }

  public PhoneticFilter create(TokenStream input) {
-    return new PhoneticFilter(input, encoder, name, inject);
+    return new PhoneticFilter(input, encoder, inject);
  }
}
@@ -22,6 +22,7 @@ import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 *
 */
-public class TestCapitalizationFilter extends BaseTokenTestCase {
+public class TestCapitalizationFilterFactory extends BaseTokenTestCase {

  public void testCapitalization() throws Exception
  {

@@ -40,74 +40,78 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {

    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
    factory.init( args );
-    char[] termBuffer;
-    termBuffer = "kiTTEN".toCharArray();
-    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
-    assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kiTTEN"))),
+        new String[] { "Kitten" });

    factory.forceFirstLetter = true;

-    termBuffer = "and".toCharArray();
-    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
-    assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("and"))),
+        new String[] { "And" });

-    termBuffer = "AnD".toCharArray();
-    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
-    assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced, but it's not a keep word, either
+    //first is forced, but it's not a keep word, either
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
+        new String[] { "And" });

    factory.forceFirstLetter = false;
-    termBuffer = "AnD".toCharArray();
-    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
-    assertEquals( "And", new String(termBuffer, 0, termBuffer.length)); //first is not forced, but it's not a keep word, either

+    //first is not forced, but it's not a keep word, either
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
+        new String[] { "And" });

    factory.forceFirstLetter = true;
-    termBuffer = "big".toCharArray();
-    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
-    assertEquals( "Big", new String(termBuffer, 0, termBuffer.length));
-    termBuffer = "BIG".toCharArray();
-    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
-    assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length));
-
-    Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
-    TokenStream stream = factory.create(tokenizer);
-    assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("big"))),
+        new String[] { "Big" });

+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("BIG"))),
+        new String[] { "BIG" });

+    assertTokenStreamContents(factory.create(
+        new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"))),
+        new String[] { "Hello there my name is ryan" });

    // now each token
    factory.onlyFirstWord = false;
-    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
-    stream = factory.create(tokenizer);
-    assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
+        new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });

    // now only the long words
    factory.minWordLength = 3;
-    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
-    stream = factory.create(tokenizer);
-    assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
+        new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });

    // without prefix
-    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
-    stream = factory.create(tokenizer);
-    assertTokenStreamContents(stream, new String[] { "Mckinley" });
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
+        new String[] { "Mckinley" });

    // Now try some prefixes
    factory = new CapitalizationFilterFactory();
    args.put( "okPrefix", "McK" ); // all words
    factory.init( args );
-    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
-    stream = factory.create(tokenizer);
-    assertTokenStreamContents(stream, new String[] { "McKinley" });
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
+        new String[] { "McKinley" });

    // now try some stuff with numbers
    factory.forceFirstLetter = false;
    factory.onlyFirstWord = false;
-    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
-    stream = factory.create(tokenizer);
-    assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
+    assertTokenStreamContents(factory.create(
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third"))),
+        new String[] { "1st", "2nd", "Third" });

-    factory.forceFirstLetter = true;
-    tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
-    stream = factory.create(tokenizer);
-    assertTokenStreamContents(stream, new String[] { "The The the" });
+    factory.forceFirstLetter = true;
+    assertTokenStreamContents(factory.create(
+        new KeywordTokenizer(new StringReader("the The the"))),
+        new String[] { "The The the" });
  }

  public void testKeepIgnoreCase() throws Exception {

@@ -118,21 +122,20 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {

    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
    factory.init( args );
-    char[] termBuffer;
-    termBuffer = "kiTTEN".toCharArray();
    factory.forceFirstLetter = true;
-    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
-    assertEquals( "KiTTEN", new String(termBuffer, 0, termBuffer.length));
+    assertTokenStreamContents(factory.create(
+        new KeywordTokenizer(new StringReader("kiTTEN"))),
+        new String[] { "KiTTEN" });

    factory.forceFirstLetter = false;
-    termBuffer = "kiTTEN".toCharArray();
-    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
-    assertEquals( "kiTTEN", new String(termBuffer, 0, termBuffer.length));
+    assertTokenStreamContents(factory.create(
+        new KeywordTokenizer(new StringReader("kiTTEN"))),
+        new String[] { "kiTTEN" });

    factory.keep = null;
-    termBuffer = "kiTTEN".toCharArray();
-    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
-    assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
+    assertTokenStreamContents(factory.create(
+        new KeywordTokenizer(new StringReader("kiTTEN"))),
+        new String[] { "Kitten" });
  }

  /**
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * @version $Id$
 */
-public class TestPhoneticFilter extends BaseTokenTestCase {
+public class TestPhoneticFilterFactory extends BaseTokenTestCase {

  public void testFactory()
  {