LUCENE-2413: consolidate remaining solr tokenstreams into modules/analysis

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@957162 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-06-23 11:25:17 +00:00
parent 653c7c160b
commit 8f71031ac8
20 changed files with 604 additions and 199 deletions

View File

@ -27,11 +27,14 @@ New Features
with text contained in the required words (inverse of StopFilter).
- o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that puts
hyphenated words broken into two lines back together.
- o.a.l.analysis.miscellaneous.CapitalizationFilter: A TokenFilter that applies
capitalization rules to tokens.
- o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
- o.a.l.analysis.synonym.SynonymFilter: A synonym filter that supports multi-word
synonyms.
(... in progress)
- o.a.l.analysis.phonetic: Package for phonetic search, containing various
phonetic encoders such as Double Metaphone.
* LUCENE-2413: Consolidated all Lucene analyzers into common.
- o.a.l.analysis.KeywordAnalyzer -> o.a.l.analysis.core.KeywordAnalyzer
@ -60,7 +63,6 @@ New Features
- o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
- o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
- o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
... (in progress)
Build

View File

@ -4,6 +4,10 @@ Copyright 2006 The Apache Software Foundation
This product includes software developed by
The Apache Software Foundation (http://www.apache.org/).
Includes software from other Apache Software Foundation projects,
including, but not limited to:
- Apache Commons
The snowball stemmers in
common/src/java/net/sf/snowball
were developed by Martin Porter and Richard Boulton.

View File

@ -20,7 +20,12 @@ lucene-analyzers-common-XX.jar
lucene-analyzers-icu-XX.jar
An add-on analysis library that provides improved Unicode support via
International Components for Unicode (ICU). Note: this module depends on
the ICU4j jar file (version > 4.4.0)
the ICU4j jar file (version >= 4.4.0)
lucene-analyzers-phonetic-XX.jar
An add-on analysis library that provides phonetic encoders via Apache
Commons-Codec. Note: this module depends on the commons-codec jar
file (version >= 1.4)
lucene-analyzers-smartcn-XX.jar
An add-on analysis library that provides word segmentation for Simplified
@ -32,12 +37,14 @@ lucene-analyzers-stempel-XX.jar
common/src/java
icu/src/java
phonetic/src/java
smartcn/src/java
stempel/src/java
The source code for the four libraries.
The source code for the five libraries.
common/src/test
icu/src/test
phonetic/src/test
smartcn/src/test
stempel/src/test
Unit tests for the four libraries.
Unit tests for the five libraries.

View File

@ -35,6 +35,10 @@
<ant dir="icu" />
</target>
<target name="phonetic">
<ant dir="phonetic" />
</target>
<target name="smartcn">
<ant dir="smartcn" />
</target>
@ -44,29 +48,33 @@
</target>
<target name="default" depends="compile"/>
<target name="compile" depends="common,icu,smartcn,stempel" />
<target name="compile" depends="common,icu,phonetic,smartcn,stempel" />
<target name="clean">
<ant dir="common" target="clean" />
<ant dir="icu" target="clean" />
<ant dir="phonetic" target="clean" />
<ant dir="smartcn" target="clean" />
<ant dir="stempel" target="clean" />
</target>
<target name="compile-core">
<ant dir="common" target="compile-core" />
<ant dir="icu" target="compile-core" />
<ant dir="phonetic" target="compile-core" />
<ant dir="smartcn" target="compile-core" />
<ant dir="stempel" target="compile-core" />
</target>
<target name="compile-test">
<ant dir="common" target="compile-test" />
<ant dir="icu" target="compile-test" />
<ant dir="phonetic" target="compile-test" />
<ant dir="smartcn" target="compile-test" />
<ant dir="stempel" target="compile-test" />
</target>
<target name="test">
<ant dir="common" target="test" />
<ant dir="icu" target="test" />
<ant dir="phonetic" target="test" />
<ant dir="smartcn" target="test" />
<ant dir="stempel" target="test" />
</target>
@ -76,6 +84,7 @@
<target name="dist-maven" depends="default">
<ant dir="common" target="dist-maven" />
<ant dir="icu" target="dist-maven" />
<ant dir="phonetic" target="dist-maven" />
<ant dir="smartcn" target="dist-maven" />
<ant dir="stempel" target="dist-maven" />
</target>
@ -83,6 +92,7 @@
<target name="javadocs">
<ant dir="common" target="javadocs" />
<ant dir="icu" target="javadocs" />
<ant dir="phonetic" target="javadocs" />
<ant dir="smartcn" target="javadocs" />
<ant dir="stempel" target="javadocs" />
</target>
@ -90,6 +100,7 @@
<target name="javadocs-index.html">
<ant dir="common" target="javadocs-index.html" />
<ant dir="icu" target="javadocs-index.html" />
<ant dir="phonetic" target="javadocs-index.html" />
<ant dir="smartcn" target="javadocs-index.html" />
<ant dir="stempel" target="javadocs-index.html" />
</target>

View File

@ -0,0 +1,181 @@
package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
/**
* A filter to apply normal capitalization rules to Tokens. It will make the first letter
* capital and the rest lower case.
* <p/>
* This filter is particularly useful to build nice looking facet parameters. This filter
* is not appropriate if you intend to use a prefix query.
*/
public final class CapitalizationFilter extends TokenFilter {
  /** Default {@code maxWordCount}: no practical limit on the number of words in a token. */
  public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
  /** Default {@code maxTokenLength}: no practical limit on the length of a token. */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;

  private final boolean onlyFirstWord;
  private final CharArraySet keep;
  private final boolean forceFirstLetter;
  private final Collection<char[]> okPrefix;

  private final int minWordLength;
  private final int maxWordCount;
  private final int maxTokenLength;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Creates a CapitalizationFilter with the default parameters.
   * <p>
   * Calls {@link #CapitalizationFilter(TokenStream, boolean, CharArraySet, boolean, Collection, int, int, int)
   *   CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH)}
   */
  public CapitalizationFilter(TokenStream in) {
    this(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  }

  /**
   * Creates a CapitalizationFilter with the specified parameters.
   * <p>
   * Within a token, words are delimited by any character {@code <= ' '} or by {@code '.'}.
   *
   * @param in input tokenstream
   * @param onlyFirstWord if true, only the first word of a token is capitalized and every
   *        following word is lower-cased; if false, each word is capitalized.
   * @param keep words that are left unchanged (may be null for none). Words after the first
   *        are still lower-cased when {@code onlyFirstWord} is true, because that rule is
   *        applied before the keep list is consulted.
   * @param forceFirstLetter force the first letter of the first word to be capitalized even
   *        if the word is in the keep list.
   * @param okPrefix do not change word capitalization if a word begins with something in this
   *        list (may be null for none). The comparison is case sensitive.
   * @param minWordLength how long the word needs to be to get capitalization applied. If the
   *        minWordLength is 3, "and" becomes "And" but "or" stays "or".
   * @param maxWordCount if the token contains more than maxWordCount words, the capitalization
   *        is assumed to be correct and the token is restored to its original text.
   * @param maxTokenLength tokens whose length is greater than or equal to this value are
   *        passed through unchanged.
   */
  public CapitalizationFilter(TokenStream in, boolean onlyFirstWord, CharArraySet keep,
      boolean forceFirstLetter, Collection<char[]> okPrefix, int minWordLength,
      int maxWordCount, int maxTokenLength) {
    super(in);
    this.onlyFirstWord = onlyFirstWord;
    this.keep = keep;
    this.forceFirstLetter = forceFirstLetter;
    this.okPrefix = okPrefix;
    this.minWordLength = minWordLength;
    this.maxWordCount = maxWordCount;
    this.maxTokenLength = maxTokenLength;
  }

  /**
   * Advances the wrapped stream and rewrites the capitalization of the current term in place.
   *
   * @return false once the wrapped stream is exhausted, true otherwise
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    char[] termBuffer = termAtt.buffer();
    int termBufferLength = termAtt.length();

    // Tokens at or above the length limit are never touched, so no backup is needed either.
    if (termBufferLength >= maxTokenLength) {
      return true;
    }

    char[] backup = null;
    if (maxWordCount < DEFAULT_MAX_WORD_COUNT) {
      // make a backup in case we exceed the word count
      backup = new char[termBufferLength];
      System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
    }

    // NOTE(review): a separator that starts the token or directly follows another separator
    // does not advance lastWordStart, so it becomes the first character of the following
    // "word" and that word's real first letter is not capitalized. Kept as-is to preserve
    // existing output; confirm whether this is intended.
    int wordCount = 0;
    int lastWordStart = 0;
    for (int i = 0; i < termBufferLength; i++) {
      char c = termBuffer[i];
      if (c <= ' ' || c == '.') {
        int len = i - lastWordStart;
        if (len > 0) {
          processWord(termBuffer, lastWordStart, len, wordCount++);
          lastWordStart = i + 1;
          // The next character starts the following word and can never terminate a
          // non-empty word, so it does not need to be examined.
          i++;
        }
      }
    }

    // process the last word
    if (lastWordStart < termBufferLength) {
      processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
    }

    // wordCount can only exceed maxWordCount when a limit was set, so backup is non-null here
    if (wordCount > maxWordCount) {
      termAtt.copyBuffer(backup, 0, termBufferLength);
    }
    return true;
  }

  /**
   * Applies the capitalization rules to a single word, modifying {@code buffer} in place.
   *
   * @param buffer the term buffer containing the word
   * @param offset index of the word's first character
   * @param length number of characters in the word
   * @param wordCount zero-based position of the word within the token
   */
  private void processWord(char[] buffer, int offset, int length, int wordCount) {
    if (length < 1) {
      return;
    }

    if (onlyFirstWord && wordCount > 0) {
      for (int i = 0; i < length; i++) {
        buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
      }
      return;
    }

    if (keep != null && keep.contains(buffer, offset, length)) {
      if (wordCount == 0 && forceFirstLetter) {
        buffer[offset] = Character.toUpperCase(buffer[offset]);
      }
      return;
    }

    if (length < minWordLength) {
      return;
    }

    if (okPrefix != null) {
      for (char[] prefix : okPrefix) {
        if (startsWith(buffer, offset, length, prefix)) {
          return;
        }
      }
    }

    // We know it has at least one character
    buffer[offset] = Character.toUpperCase(buffer[offset]);
    for (int i = 1; i < length; i++) {
      buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
    }
  }

  /** Returns true if the word at {@code buffer[offset, offset + length)} begins with {@code prefix}. */
  private static boolean startsWith(char[] buffer, int offset, int length, char[] prefix) {
    if (length < prefix.length) {
      // the word is shorter than the prefix, so it cannot match
      return false;
    }
    for (int i = 0; i < prefix.length; i++) {
      if (prefix[i] != buffer[offset + i]) {
        return false;
      }
    }
    return true;
  }
}

View File

@ -0,0 +1,121 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*;
/** Tests {@link CapitalizationFilter} */
public class TestCapitalizationFilter extends BaseTokenStreamTestCase {

  /**
   * Runs the filter over single words, whole sentences, prefixed names and numeric
   * tokens, checking the resulting terms for several parameter combinations.
   */
  public void testCapitalization() throws Exception {
    final int maxWords = DEFAULT_MAX_WORD_COUNT;
    final int maxLength = DEFAULT_MAX_TOKEN_LENGTH;
    final String sentence = "Hello thEre my Name is Ryan";
    final CharArraySet keepWords = new CharArraySet(TEST_VERSION_CURRENT,
        Arrays.asList("and", "the", "it", "BIG"), false);

    // single-word tokens, first letter forced
    assertCapitalizesTo("kiTTEN", new String[] { "Kitten" },
        true, keepWords, true, null, 0, maxWords, maxLength);
    assertCapitalizesTo("and", new String[] { "And" },
        true, keepWords, true, null, 0, maxWords, maxLength);
    assertCapitalizesTo("AnD", new String[] { "And" },
        true, keepWords, true, null, 0, maxWords, maxLength);

    // the first letter is not forced here, but "AnD" is not a keep word either
    assertCapitalizesTo("AnD", new String[] { "And" },
        true, keepWords, false, null, 0, maxWords, maxLength);

    assertCapitalizesTo("big", new String[] { "Big" },
        true, keepWords, true, null, 0, maxWords, maxLength);
    assertCapitalizesTo("BIG", new String[] { "BIG" },
        true, keepWords, true, null, 0, maxWords, maxLength);

    // the whole sentence as one token: only its first word is capitalized
    assertCapitalizesToKeyword(sentence, "Hello there my name is ryan",
        true, keepWords, true, null, 0, maxWords, maxLength);

    // one token per word: every token is capitalized
    assertCapitalizesTo(sentence,
        new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" },
        false, keepWords, true, null, 0, maxWords, maxLength);

    // a minimum word length of 3 leaves the short words alone
    assertCapitalizesTo(sentence,
        new String[] { "Hello", "There", "my", "Name", "is", "Ryan" },
        false, keepWords, true, null, 3, maxWords, maxLength);

    // no ok-prefix configured: the inner capital is lost
    assertCapitalizesTo("McKinley", new String[] { "Mckinley" },
        true, keepWords, true, null, 0, maxWords, maxLength);

    // with an ok-prefix the word is left untouched
    List<char[]> prefixes = new ArrayList<char[]>();
    prefixes.add("McK".toCharArray());
    assertCapitalizesTo("McKinley", new String[] { "McKinley" },
        true, keepWords, true, prefixes, 0, maxWords, maxLength);

    // tokens that begin with digits
    assertCapitalizesTo("1st 2nd third", new String[] { "1st", "2nd", "Third" },
        false, keepWords, false, null, 0, maxWords, maxLength);

    // keep words inside a single token
    assertCapitalizesToKeyword("the The the", "The The the",
        false, keepWords, true, null, 0, maxWords, maxLength);
  }

  /** Wraps {@code tokenizer} in a CapitalizationFilter and checks the terms it emits. */
  static void assertCapitalizesTo(Tokenizer tokenizer, String[] expected,
      boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
      Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
      int maxTokenLength) throws IOException {
    assertTokenStreamContents(
        new CapitalizationFilter(tokenizer, onlyFirstWord, keep, forceFirstLetter,
            okPrefix, minWordLength, maxWordCount, maxTokenLength),
        expected);
  }

  /** Splits {@code input} on whitespace and checks the capitalized form of each token. */
  static void assertCapitalizesTo(String input, String[] expected,
      boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
      Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
      int maxTokenLength) throws IOException {
    Tokenizer whitespace = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    assertCapitalizesTo(whitespace, expected, onlyFirstWord, keep, forceFirstLetter,
        okPrefix, minWordLength, maxWordCount, maxTokenLength);
  }

  /** Treats {@code input} as one single token and checks its capitalized form. */
  static void assertCapitalizesToKeyword(String input, String expected,
      boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
      Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
      int maxTokenLength) throws IOException {
    Tokenizer keyword = new KeywordTokenizer(new StringReader(input));
    String[] singleToken = new String[] { expected };
    assertCapitalizesTo(keyword, singleToken, onlyFirstWord, keep, forceFirstLetter,
        okPrefix, minWordLength, maxWordCount, maxTokenLength);
  }
}

View File

@ -0,0 +1,63 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Ant build for the phonetic analysis module; it builds on top of analyzers-common. -->
<project name="analyzers-phonetic" default="default">
<description>
Provides phonetic encoding support via Apache Commons Codec.
</description>
<!-- Output locations for this module, resolved relative to this build file. -->
<property name="build.dir" location="../build/phonetic" />
<property name="dist.dir" location="../dist/phonetic" />
<!-- Third-party jars needed by this module: any commons-codec jar found in lib/. -->
<path id="additional.dependencies">
<fileset dir="lib" includes="commons-codec-*.jar"/>
</path>
<!-- Flattens the path above into the project.classpath property using unix path
     conventions. This file never reads the property itself; presumably the imported
     contrib build consumes it (confirm in contrib-build.xml). -->
<pathconvert property="project.classpath"
targetos="unix"
refid="additional.dependencies"
/>
<!-- Shared contrib build definitions. module-uptodate, base.classpath, junit-path and
     common.compile-core used below are not defined in this file and are presumably
     supplied by this import. -->
<import file="../../../lucene/contrib/contrib-build.xml"/>
<!-- Checks the analyzers-common jar and records the outcome in the
     analyzers-common.uptodate and analyzers-common.jar properties (exact macro
     semantics live outside this file). -->
<module-uptodate name="analysis/common" jarfile="../build/common/lucene-analyzers-common-${version}.jar"
property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
<!-- Compile classpath: the analyzers-common jar plus the base classpath. -->
<path id="classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="base.classpath"/>
</path>
<!-- Test classpath: the compile classpath, the Lucene core and analyzers-common test
     classes, JUnit, and this module's own compiled classes. The analyzers-common jar is
     listed directly and again through the classpath reference. -->
<path id="test.classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="classpath"/>
<pathelement location="../../../lucene/build/classes/test/"/>
<pathelement location="../build/common/classes/test/"/>
<path refid="junit-path"/>
<pathelement location="${build.dir}/classes/java"/>
</path>
<!-- Makes sure analyzers-common is built before running the shared compile-core target. -->
<target name="compile-core" depends="build-analyzers-common, common.compile-core" />
<!-- Runs the default target of ../common/build.xml without inheriting this project's
     properties; skipped when analyzers-common.uptodate is set. -->
<target name="build-analyzers-common" unless="analyzers-common.uptodate">
<echo>phonetic building dependency ${analyzers-common.jar}</echo>
<ant antfile="../common/build.xml" target="default" inheritall="false" dir="../common" />
</target>
</project>

View File

@ -0,0 +1,2 @@
AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,46 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-contrib</artifactId>
<version>@version@</version>
</parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-phonetic</artifactId>
<name>
Lucene Phonetic Filters
</name>
<version>@version@</version>
<description>
Provides phonetic encoding via Commons Codec.
</description>
<packaging>jar</packaging>
<dependencies>
  <!-- Apache Commons Codec is published to Maven repositories under the
       commons-codec groupId and artifactId, not org.apache.commons:codec. -->
  <dependency>
    <groupId>commons-codec</groupId>
    <artifactId>commons-codec</artifactId>
    <version>${codec-version}</version>
  </dependency>
</dependencies>
</project>

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.phonetic;
import java.io.IOException;
import java.util.LinkedList;
@ -35,7 +35,7 @@ public final class DoubleMetaphoneFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
super(input);
this.encoder.setMaxCodeLen(maxCodeLength);
this.inject = inject;

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.phonetic;
import org.apache.commons.codec.Encoder;
import org.apache.lucene.analysis.TokenFilter;
@ -28,23 +28,19 @@ import java.io.IOException;
/**
* Create tokens for phonetic matches. See:
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
*
* @version $Id$
*/
public final class PhoneticFilter extends TokenFilter
{
protected boolean inject = true;
protected Encoder encoder = null;
protected String name = null;
protected State save = null;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
public PhoneticFilter(TokenStream in, Encoder encoder, boolean inject) {
super(in);
this.encoder = encoder;
this.name = name;
this.inject = inject;
}

View File

@ -14,52 +14,53 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.phonetic;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
public void testSize4FalseInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "ANTR" });
}
public void testSize4TrueInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
}
public void testAlternateInjectFalse() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Kuczewski"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
}
public void testSize8FalseInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
}
public void testNonConvertableStringsWithInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
}
public void testNonConvertableStringsWithoutInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
// should have something after the stream
stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello"));
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%& hello"));
filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
}

View File

@ -0,0 +1,73 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.phonetic;
import java.io.StringReader;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.Caverphone;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Tests {@link PhoneticFilter}
*/
public class TestPhoneticFilter extends BaseTokenStreamTestCase {

  /**
   * Checks the output of several Commons Codec encoders, each one with and without
   * injection of the original token next to its phonetic code.
   */
  public void testAlgorithms() throws Exception {
    final String letters = "aaa bbb ccc easgasg";
    final String names = "Darda Karleen Datha Carlene";

    assertAlgorithm(new Metaphone(), true, letters,
        new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" });
    assertAlgorithm(new Metaphone(), false, letters,
        new String[] { "A", "B", "KKK", "ESKS" });

    assertAlgorithm(new DoubleMetaphone(), true, letters,
        new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" });
    assertAlgorithm(new DoubleMetaphone(), false, letters,
        new String[] { "A", "PP", "KK", "ASKS" });

    assertAlgorithm(new Soundex(), true, letters,
        new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" });
    assertAlgorithm(new Soundex(), false, letters,
        new String[] { "A000", "B000", "C000", "E220" });

    assertAlgorithm(new RefinedSoundex(), true, letters,
        new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" });
    assertAlgorithm(new RefinedSoundex(), false, letters,
        new String[] { "A0", "B1", "C3", "E034034" });

    assertAlgorithm(new Caverphone(), true, names,
        new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen",
            "TTA1111111", "Datha", "KLN1111111", "Carlene" });
    assertAlgorithm(new Caverphone(), false, names,
        new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" });
  }

  /**
   * Splits {@code input} on whitespace, runs it through a PhoneticFilter built from
   * {@code encoder} and {@code inject}, and checks the emitted terms.
   */
  static void assertAlgorithm(Encoder encoder, boolean inject, String input,
      String[] expected) throws Exception {
    StringReader reader = new StringReader(input);
    Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    assertTokenStreamContents(new PhoneticFilter(source, encoder, inject), expected);
  }
}

View File

@ -147,6 +147,7 @@
<path id="lucene.classpath">
<pathelement location="${common-solr.dir}/../lucene/build/classes/java" />
<pathelement location="${common-solr.dir}/../modules/analysis/build/common/classes/java" />
<pathelement location="${common-solr.dir}/../modules/analysis/build/phonetic/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/highlighter/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/memory/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/misc/classes/java" />
@ -162,6 +163,7 @@
</subant>
<subant target="jar" inheritall="false" failonerror="true">
<fileset dir="../modules/analysis/common" includes="build.xml" />
<fileset dir="../modules/analysis/phonetic" includes="build.xml" />
<fileset dir="../lucene/contrib/highlighter" includes="build.xml" />
<fileset dir="../lucene/contrib/memory" includes="build.xml" />
<fileset dir="../lucene/contrib/misc" includes="build.xml" />
@ -181,6 +183,9 @@
<fileset dir="../modules/analysis/build/common">
<include name="lucene-analyzers-common-${version}.jar" />
</fileset>
<fileset dir="../modules/analysis/build/phonetic">
<include name="lucene-analyzers-phonetic-${version}.jar" />
</fileset>
<fileset dir="../lucene/build/contrib/highlighter">
<include name="lucene-highlighter-${version}.jar" />
</fileset>
@ -206,6 +211,7 @@
<property name="lucene-compiled" value="true"/>
<subant target="default">
<fileset dir="../modules/analysis/common" includes="build.xml"/>
<fileset dir="../modules/analysis/phonetic" includes="build.xml"/>
<fileset dir="../lucene/contrib/highlighter" includes="build.xml"/>
<fileset dir="../lucene/contrib/memory" includes="build.xml"/>
<fileset dir="../lucene/contrib/misc" includes="build.xml"/>

View File

@ -17,11 +17,10 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@ -29,11 +28,7 @@ import java.util.Map;
import java.util.StringTokenizer;
/**
* A filter to apply normal capitalization rules to Tokens. It will make the first letter
* capital and the rest lower case.
* <p/>
* This filter is particularly useful to build nice looking facet parameters. This filter
* is not appropriate if you intend to use a prefix query.
* Factory for {@link CapitalizationFilter}.
* <p/>
* The factory takes parameters:<br/>
* "onlyFirstWord" - should each word be capitalized or all of the words?<br/>
@ -52,7 +47,6 @@ import java.util.StringTokenizer;
* @since solr 1.3
*/
public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
public static final String KEEP = "keep";
public static final String KEEP_IGNORE_CASE = "keepIgnoreCase";
public static final String OK_PREFIX = "okPrefix";
@ -68,8 +62,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
Collection<char[]> okPrefix = Collections.emptyList(); // for Example: McK
int minWordLength = 0; // don't modify capitalization for words shorter then this
int maxWordCount = DEFAULT_MAX_WORD_COUNT;
int maxTokenLength = DEFAULT_MAX_WORD_COUNT;
int maxWordCount = CapitalizationFilter.DEFAULT_MAX_WORD_COUNT;
int maxTokenLength = CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH;
boolean onlyFirstWord = true;
boolean forceFirstLetter = true; // make sure the first letter is capitol even if it is in the keep list
@ -128,116 +122,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
}
}
/**
 * Applies this factory's capitalization rules, in place, to a single word
 * inside a term buffer.
 * <p/>
 * The rules are checked in this order, and the first one that applies wins:
 * <ol>
 * <li>an empty word is ignored;</li>
 * <li>when <code>onlyFirstWord</code> is set, every word after the first is
 * lower-cased completely;</li>
 * <li>a word found in the <code>keep</code> set is left alone, except that its
 * first character is upper-cased when it is the first word and
 * <code>forceFirstLetter</code> is set;</li>
 * <li>a word shorter than <code>minWordLength</code> is left alone;</li>
 * <li>a word that starts with one of the <code>okPrefix</code> entries is left
 * alone;</li>
 * <li>otherwise the first character is upper-cased and the rest lower-cased.</li>
 * </ol>
 *
 * @param buffer    the characters of the term; modified in place
 * @param offset    index in <code>buffer</code> of the word's first character
 * @param length    number of characters in the word
 * @param wordCount zero-based position of this word within the term
 */
public void processWord(char[] buffer, int offset, int length, int wordCount) {
  if (length < 1) {
    return;
  }
  final int end = offset + length;

  // Only the first word may carry capitals: flatten every later word.
  if (onlyFirstWord && wordCount > 0) {
    for (int pos = offset; pos < end; pos++) {
      buffer[pos] = Character.toLowerCase(buffer[pos]);
    }
    return;
  }

  // Keep-words retain their spelling; at most the leading letter is forced up.
  if (keep != null && keep.contains(buffer, offset, length)) {
    if (forceFirstLetter && wordCount == 0) {
      buffer[offset] = Character.toUpperCase(buffer[offset]);
    }
    return;
  }

  if (length < minWordLength) {
    return;
  }

  // A word beginning with an accepted prefix is left exactly as written.
  for (char[] prefix : okPrefix) {
    if (prefix.length > length) {
      continue; // the word is too short to start with this prefix
    }
    int matched = 0;
    while (matched < prefix.length && prefix[matched] == buffer[offset + matched]) {
      matched++;
    }
    if (matched == prefix.length) {
      return;
    }
  }

  // Default rule: capital first letter, everything after it lower case.
  buffer[offset] = Character.toUpperCase(buffer[offset]);
  for (int pos = offset + 1; pos < end; pos++) {
    buffer[pos] = Character.toLowerCase(buffer[pos]);
  }
}
public CapitalizationFilter create(TokenStream input) {
return new CapitalizationFilter(input, this);
return new CapitalizationFilter(input, onlyFirstWord, keep,
forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
}
}
/**
 * Token filter that hands each word of a term to
 * {@link CapitalizationFilterFactory#processWord(char[], int, int, int)}.
 * <p/>
 * All configuration lives on the factory, so the difficult setup does not
 * have to be repeated each time a filter is created. The class is
 * package-private because it is not useful without the factory.
 * <p/>
 * Words inside a term are separated by any character <code>&lt;= ' '</code>
 * or by <code>'.'</code>.
 */
final class CapitalizationFilter extends TokenFilter {
  private final CapitalizationFilterFactory factory;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * @param in      the token stream to read terms from
   * @param factory the factory supplying the limits and the per-word rules
   */
  public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
    super(in);
    this.factory = factory;
  }

  /**
   * Advances the wrapped stream and rewrites the capitalization of the
   * current term in place.
   * <p/>
   * Terms whose length is not below the factory's <code>maxTokenLength</code>
   * are passed through untouched. When a term turns out to hold more words
   * than the factory's <code>maxWordCount</code>, its original text is restored.
   *
   * @return <code>false</code> once the wrapped stream is exhausted,
   *         <code>true</code> otherwise
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    final char[] term = termAtt.buffer();
    final int termLength = termAtt.length();

    // A word limit is configured, so keep a copy to restore if it is exceeded.
    char[] original = null;
    if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
      original = new char[termLength];
      System.arraycopy(term, 0, original, 0, termLength);
    }

    if (termLength >= factory.maxTokenLength) {
      return true; // too long to process; leave the term as it is
    }

    int words = 0;
    int wordStart = 0;
    int pos = 0;
    while (pos < termLength) {
      final char ch = term[pos];
      final boolean delimiter = ch <= ' ' || ch == '.';
      if (delimiter && pos > wordStart) {
        factory.processWord(term, wordStart, pos - wordStart, words);
        words++;
        wordStart = pos + 1;
        // Advance by two: the character right after the delimiter is not
        // itself examined as a delimiter.
        pos += 2;
      } else {
        pos++;
      }
    }

    // Whatever follows the last delimiter is the final word.
    if (wordStart < termLength) {
      factory.processWord(term, wordStart, termLength - wordStart, words);
      words++;
    }

    if (words > factory.maxWordCount) {
      termAtt.copyBuffer(original, 0, termLength);
    }
    return true;
  }
}

View File

@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
{

View File

@ -29,6 +29,7 @@ import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;
@ -96,6 +97,6 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
}
public PhoneticFilter create(TokenStream input) {
return new PhoneticFilter(input,encoder,name,inject);
return new PhoneticFilter(input,encoder,inject);
}
}

View File

@ -22,6 +22,7 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {

View File

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
*
*/
public class TestCapitalizationFilter extends BaseTokenTestCase {
public class TestCapitalizationFilterFactory extends BaseTokenTestCase {
public void testCapitalization() throws Exception
{
@ -40,74 +40,78 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init( args );
char[] termBuffer;
termBuffer = "kiTTEN".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kiTTEN"))),
new String[] { "Kitten" });
factory.forceFirstLetter = true;
termBuffer = "and".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("and"))),
new String[] { "And" });
termBuffer = "AnD".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced, but it's not a keep word, either
//first is forced, but it's not a keep word, either
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
new String[] { "And" });
factory.forceFirstLetter = false;
termBuffer = "AnD".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "And", new String(termBuffer, 0, termBuffer.length)); //first is not forced, but it's not a keep word, either
//first is not forced, but it's not a keep word, either
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
new String[] { "And" });
factory.forceFirstLetter = true;
termBuffer = "big".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "Big", new String(termBuffer, 0, termBuffer.length));
termBuffer = "BIG".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length));
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("big"))),
new String[] { "Big" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("BIG"))),
new String[] { "BIG" });
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"))),
new String[] { "Hello there my name is ryan" });
// now each token
factory.onlyFirstWord = false;
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
// now only the long words
factory.minWordLength = 3;
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
// without prefix
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Mckinley" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
new String[] { "Mckinley" });
// Now try some prefixes
factory = new CapitalizationFilterFactory();
args.put( "okPrefix", "McK" ); // all words
factory.init( args );
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "McKinley" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
new String[] { "McKinley" });
// now try some stuff with numbers
factory.forceFirstLetter = false;
factory.onlyFirstWord = false;
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third"))),
new String[] { "1st", "2nd", "Third" });
factory.forceFirstLetter = true;
tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "The The the" });
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("the The the"))),
new String[] { "The The the" });
}
public void testKeepIgnoreCase() throws Exception {
@ -118,21 +122,20 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init( args );
char[] termBuffer;
termBuffer = "kiTTEN".toCharArray();
factory.forceFirstLetter = true;
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "KiTTEN", new String(termBuffer, 0, termBuffer.length));
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("kiTTEN"))),
new String[] { "KiTTEN" });
factory.forceFirstLetter = false;
termBuffer = "kiTTEN".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "kiTTEN", new String(termBuffer, 0, termBuffer.length));
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("kiTTEN"))),
new String[] { "kiTTEN" });
factory.keep = null;
termBuffer = "kiTTEN".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("kiTTEN"))),
new String[] { "Kitten" });
}
/**

View File

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* @version $Id$
*/
public class TestPhoneticFilter extends BaseTokenTestCase {
public class TestPhoneticFilterFactory extends BaseTokenTestCase {
public void testFactory()
{