diff --git a/modules/analysis/CHANGES.txt b/modules/analysis/CHANGES.txt index 8ba174d695d..fcbba83b34f 100644 --- a/modules/analysis/CHANGES.txt +++ b/modules/analysis/CHANGES.txt @@ -27,11 +27,14 @@ New Features with text contained in the required words (inverse of StopFilter). - o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that puts hyphenated words broken into two lines back together. + - o.a.l.analysis.miscellaneous.CapitalizationFilter: A TokenFilter that applies + capitalization rules to tokens. - o.a.l.analysis.pattern: Package for pattern-based analysis, containing a CharFilter, Tokenizer, and Tokenfilter for transforming text with regexes. - o.a.l.analysis.synonym.SynonymFilter: A synonym filter that supports multi-word synonyms. - (... in progress) + - o.a.l.analysis.phonetic: Package for phonetic search, containing various + phonetic encoders such as Double Metaphone. * LUCENE-2413: Consolidated all Lucene analyzers into common. - o.a.l.analysis.KeywordAnalyzer -> o.a.l.analysis.core.KeywordAnalyzer @@ -60,7 +63,6 @@ New Features - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader - ... (in progress) Build diff --git a/modules/analysis/NOTICE.txt b/modules/analysis/NOTICE.txt index 8b13cc06746..6abde9313c7 100644 --- a/modules/analysis/NOTICE.txt +++ b/modules/analysis/NOTICE.txt @@ -4,6 +4,10 @@ Copyright 2006 The Apache Software Foundation This product includes software developed by The Apache Software Foundation (http://www.apache.org/). +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Commons + The snowball stemmers in common/src/java/net/sf/snowball were developed by Martin Porter and Richard Boulton. 
diff --git a/modules/analysis/README.txt b/modules/analysis/README.txt index 53d3c34df90..85b4e9392c3 100644 --- a/modules/analysis/README.txt +++ b/modules/analysis/README.txt @@ -20,7 +20,12 @@ lucene-analyzers-common-XX.jar lucene-analyzers-icu-XX.jar An add-on analysis library that provides improved Unicode support via International Components for Unicode (ICU). Note: this module depends on - the ICU4j jar file (version > 4.4.0) + the ICU4j jar file (version >= 4.4.0) + +lucene-analyzers-phonetic-XX.jar + An add-on analysis library that provides phonetic encoders via Apache + Commons-Codec. Note: this module depends on the commons-codec jar + file (version >= 1.4) lucene-analyzers-smartcn-XX.jar An add-on analysis library that provides word segmentation for Simplified @@ -32,12 +37,14 @@ lucene-analyzers-stempel-XX.jar common/src/java icu/src/java +phonetic/src/java smartcn/src/java stempel/src/java - The source code for the four libraries. + The source code for the five libraries. common/src/test icu/src/test +phonetic/src/test smartcn/src/test stempel/src/test - Unit tests for the four libraries. + Unit tests for the five libraries. 
diff --git a/modules/analysis/build.xml b/modules/analysis/build.xml index 750cfa90a26..599442d153f 100644 --- a/modules/analysis/build.xml +++ b/modules/analysis/build.xml @@ -35,6 +35,10 @@ + + + + @@ -44,29 +48,33 @@ - + + + + + @@ -76,6 +84,7 @@ + @@ -83,6 +92,7 @@ + @@ -90,6 +100,7 @@ + diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java new file mode 100644 index 00000000000..a41314ed891 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilter.java @@ -0,0 +1,181 @@ +package org.apache.lucene.analysis.miscellaneous; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.CharArraySet; + +/** + * A filter to apply normal capitalization rules to Tokens. It will make the first letter + * capital and the rest lower case. + *

+ * This filter is particularly useful to build nice looking facet parameters. This filter + * is not appropriate if you intend to use a prefix query. + */ +public final class CapitalizationFilter extends TokenFilter { + public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE; + public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE; + + private final boolean onlyFirstWord; + private final CharArraySet keep; + private final boolean forceFirstLetter; + private final Collection okPrefix; + + private final int minWordLength; + private final int maxWordCount; + private final int maxTokenLength; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + /** + * Creates a CapitalizationFilter with the default parameters. + *

+ * Calls {@link #CapitalizationFilter(TokenStream, boolean, CharArraySet, boolean, Collection, int, int, int) + * CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH)} + */ + public CapitalizationFilter(TokenStream in) { + this(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + } + + /** + * Creates a CapitalizationFilter with the specified parameters. + * @param in input tokenstream + * @param onlyFirstWord should each word be capitalized or all of the words? + * @param keep a keep word list. Each word that should be kept separated by whitespace. + * @param forceFirstLetter Force the first letter to be capitalized even if it is in the keep list. + * @param okPrefix do not change word capitalization if a word begins with something in this list. + * @param minWordLength how long the word needs to be to get capitalization applied. If the + * minWordLength is 3, "and" > "And" but "or" stays "or". + * @param maxWordCount if the token contains more than maxWordCount words, the capitalization is + * assumed to be correct. + * @param maxTokenLength tokens whose length is maxTokenLength or greater are passed through + * without any capitalization change 
+ */ + public CapitalizationFilter(TokenStream in, boolean onlyFirstWord, CharArraySet keep, + boolean forceFirstLetter, Collection okPrefix, int minWordLength, + int maxWordCount, int maxTokenLength) { + super(in); + this.onlyFirstWord = onlyFirstWord; + this.keep = keep; + this.forceFirstLetter = forceFirstLetter; + this.okPrefix = okPrefix; + this.minWordLength = minWordLength; + this.maxWordCount = maxWordCount; + this.maxTokenLength = maxTokenLength; + } + + @Override + public boolean incrementToken() throws IOException { + if (!input.incrementToken()) return false; + + char[] termBuffer = termAtt.buffer(); + int termBufferLength = termAtt.length(); + char[] backup = null; + + if (maxWordCount < DEFAULT_MAX_WORD_COUNT) { + //make a backup in case we exceed the word count + backup = new char[termBufferLength]; + System.arraycopy(termBuffer, 0, backup, 0, termBufferLength); + } + + if (termBufferLength < maxTokenLength) { + int wordCount = 0; + + int lastWordStart = 0; + for (int i = 0; i < termBufferLength; i++) { + char c = termBuffer[i]; + if (c <= ' ' || c == '.') { + int len = i - lastWordStart; + if (len > 0) { + processWord(termBuffer, lastWordStart, len, wordCount++); + lastWordStart = i + 1; + i++; + } + } + } + + // process the last word + if (lastWordStart < termBufferLength) { + processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++); + } + + if (wordCount > maxWordCount) { + termAtt.copyBuffer(backup, 0, termBufferLength); + } + } + + return true; + } + + private void processWord(char[] buffer, int offset, int length, int wordCount) { + if (length < 1) { + return; + } + + if (onlyFirstWord && wordCount > 0) { + for (int i = 0; i < length; i++) { + buffer[offset + i] = Character.toLowerCase(buffer[offset + i]); + + } + return; + } + + if (keep != null && keep.contains(buffer, offset, length)) { + if (wordCount == 0 && forceFirstLetter) { + buffer[offset] = Character.toUpperCase(buffer[offset]); + } + return; + } + + if 
(length < minWordLength) { + return; + } + + if (okPrefix != null) { + for (char[] prefix : okPrefix) { + if (length >= prefix.length) { //don't bother checking if the buffer length is less than the prefix + boolean match = true; + for (int i = 0; i < prefix.length; i++) { + if (prefix[i] != buffer[offset + i]) { + match = false; + break; + } + } + if (match == true) { + return; + } + } + } + } + + // We know it has at least one character + /*char[] chars = w.toCharArray(); + StringBuilder word = new StringBuilder( w.length() ); + word.append( Character.toUpperCase( chars[0] ) );*/ + buffer[offset] = Character.toUpperCase(buffer[offset]); + + for (int i = 1; i < length; i++) { + buffer[offset + i] = Character.toLowerCase(buffer[offset + i]); + } + //return word.toString(); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java new file mode 100644 index 00000000000..4d30d4bacb5 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.CharArraySet; + +import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*; + +/** Tests {@link CapitalizationFilter} */ +public class TestCapitalizationFilter extends BaseTokenStreamTestCase { + public void testCapitalization() throws Exception { + CharArraySet keep = new CharArraySet(TEST_VERSION_CURRENT, + Arrays.asList("and", "the", "it", "BIG"), false); + + assertCapitalizesTo("kiTTEN", new String[] { "Kitten" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesTo("and", new String[] { "And" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesTo("AnD", new String[] { "And" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + //first is not forced, but it's not a keep word, either + assertCapitalizesTo("AnD", new String[] { "And" }, + true, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesTo("big", new String[] { "Big" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesTo("BIG", new String[] { "BIG" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesToKeyword("Hello thEre my Name is Ryan", "Hello there my name is ryan", + true, keep, true, null, 0, 
DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // now each token + assertCapitalizesTo("Hello thEre my Name is Ryan", + new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }, + false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // now only the long words + assertCapitalizesTo("Hello thEre my Name is Ryan", + new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }, + false, keep, true, null, 3, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // without prefix + assertCapitalizesTo("McKinley", + new String[] { "Mckinley" }, + true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // Now try some prefixes + List okPrefix = new ArrayList(); + okPrefix.add("McK".toCharArray()); + + assertCapitalizesTo("McKinley", + new String[] { "McKinley" }, + true, keep, true, okPrefix, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + // now try some stuff with numbers + assertCapitalizesTo("1st 2nd third", + new String[] { "1st", "2nd", "Third" }, + false, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + + assertCapitalizesToKeyword("the The the", "The The the", + false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH); + } + + static void assertCapitalizesTo(Tokenizer tokenizer, String expected[], + boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter, + Collection okPrefix, int minWordLength, int maxWordCount, + int maxTokenLength) throws IOException { + CapitalizationFilter filter = new CapitalizationFilter(tokenizer, onlyFirstWord, keep, + forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength); + assertTokenStreamContents(filter, expected); + } + + static void assertCapitalizesTo(String input, String expected[], + boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter, + Collection okPrefix, int minWordLength, int maxWordCount, + int maxTokenLength) throws IOException { + 
assertCapitalizesTo(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), + expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, + maxWordCount, maxTokenLength); + } + + static void assertCapitalizesToKeyword(String input, String expected, + boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter, + Collection okPrefix, int minWordLength, int maxWordCount, + int maxTokenLength) throws IOException { + assertCapitalizesTo(new KeywordTokenizer(new StringReader(input)), + new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix, + minWordLength, maxWordCount, maxTokenLength); + } +} diff --git a/modules/analysis/phonetic/build.xml b/modules/analysis/phonetic/build.xml new file mode 100644 index 00000000000..9efd18a94b8 --- /dev/null +++ b/modules/analysis/phonetic/build.xml @@ -0,0 +1,63 @@ + + + + + + + + Provides phonetic encoding support via Apache Commons Codec. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + phonetic building dependency ${analyzers-common.jar} + + + diff --git a/modules/analysis/phonetic/lib/commons-codec-1.4.jar b/modules/analysis/phonetic/lib/commons-codec-1.4.jar new file mode 100644 index 00000000000..97a58157492 --- /dev/null +++ b/modules/analysis/phonetic/lib/commons-codec-1.4.jar @@ -0,0 +1,2 @@ +AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/modules/analysis/phonetic/pom.xml.template b/modules/analysis/phonetic/pom.xml.template new file mode 100644 index 00000000000..462c4a1073e --- /dev/null +++ b/modules/analysis/phonetic/pom.xml.template @@ -0,0 +1,46 @@ + + + + 4.0.0 + + org.apache.lucene + lucene-contrib + @version@ + + org.apache.lucene + lucene-phonetic + + Lucene Phonetic Filters + + @version@ + + Provides phonetic encoding via Commons Codec. 
+ + jar + + + org.apache.commons + codec + ${codec-version} + + + diff --git a/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java b/modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java similarity index 96% rename from solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java rename to modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java index d384d2c1ece..971c9b4f7a5 100644 --- a/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java +++ b/modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.phonetic; import java.io.IOException; import java.util.LinkedList; @@ -35,7 +35,7 @@ public final class DoubleMetaphoneFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); - protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) { + public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) { super(input); this.encoder.setMaxCodeLen(maxCodeLength); this.inject = inject; diff --git a/solr/src/java/org/apache/solr/analysis/PhoneticFilter.java b/modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java similarity index 93% rename from solr/src/java/org/apache/solr/analysis/PhoneticFilter.java rename to modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java index a6d0a3bbe21..791def825d1 100644 --- a/solr/src/java/org/apache/solr/analysis/PhoneticFilter.java +++ 
b/modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.phonetic; import org.apache.commons.codec.Encoder; import org.apache.lucene.analysis.TokenFilter; @@ -28,23 +28,19 @@ import java.io.IOException; /** * Create tokens for phonetic matches. See: * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html - * - * @version $Id$ */ public final class PhoneticFilter extends TokenFilter { protected boolean inject = true; protected Encoder encoder = null; - protected String name = null; protected State save = null; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); - public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) { + public PhoneticFilter(TokenStream in, Encoder encoder, boolean inject) { super(in); this.encoder = encoder; - this.name = name; this.inject = inject; } diff --git a/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java similarity index 72% rename from solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java rename to modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java index 35d03b1378c..e99b9b5f90a 100644 --- a/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java +++ b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterTest.java @@ -14,52 +14,53 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.solr.analysis; +package org.apache.lucene.analysis.phonetic; import java.io.StringReader; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; -public class DoubleMetaphoneFilterTest extends BaseTokenTestCase { +public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase { public void testSize4FalseInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international")); TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false); assertTokenStreamContents(filter, new String[] { "ANTR" }); } public void testSize4TrueInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international")); TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true); assertTokenStreamContents(filter, new String[] { "international", "ANTR" }); } public void testAlternateInjectFalse() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski")); + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Kuczewski")); TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false); assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" }); } public void testSize8FalseInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international")); + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international")); TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false); assertTokenStreamContents(filter, new String[] { "ANTRNXNL" }); } public 
void testNonConvertableStringsWithInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&")); + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&")); TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true); assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" }); } public void testNonConvertableStringsWithoutInject() throws Exception { - TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&")); + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&")); TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false); assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" }); // should have something after the stream - stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello")); + stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%& hello")); filter = new DoubleMetaphoneFilter(stream, 8, false); assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" }); } diff --git a/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java new file mode 100644 index 00000000000..811d1ec1906 --- /dev/null +++ b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.phonetic; + +import java.io.StringReader; + +import org.apache.commons.codec.Encoder; +import org.apache.commons.codec.language.Caverphone; +import org.apache.commons.codec.language.DoubleMetaphone; +import org.apache.commons.codec.language.Metaphone; +import org.apache.commons.codec.language.RefinedSoundex; +import org.apache.commons.codec.language.Soundex; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Tests {@link PhoneticFilter} + */ +public class TestPhoneticFilter extends BaseTokenStreamTestCase { + + public void testAlgorithms() throws Exception { + assertAlgorithm(new Metaphone(), true, "aaa bbb ccc easgasg", + new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" }); + assertAlgorithm(new Metaphone(), false, "aaa bbb ccc easgasg", + new String[] { "A", "B", "KKK", "ESKS" }); + + assertAlgorithm(new DoubleMetaphone(), true, "aaa bbb ccc easgasg", + new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" }); + assertAlgorithm(new DoubleMetaphone(), false, "aaa bbb ccc easgasg", + new String[] { "A", "PP", "KK", "ASKS" }); + + assertAlgorithm(new Soundex(), true, "aaa bbb ccc easgasg", + new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" }); + assertAlgorithm(new Soundex(), false, "aaa bbb ccc easgasg", + new String[] { "A000", "B000", "C000", "E220" }); + + assertAlgorithm(new RefinedSoundex(), true, "aaa bbb ccc 
easgasg", + new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" }); + assertAlgorithm(new RefinedSoundex(), false, "aaa bbb ccc easgasg", + new String[] { "A0", "B1", "C3", "E034034" }); + + assertAlgorithm(new Caverphone(), true, "Darda Karleen Datha Carlene", + new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen", + "TTA1111111", "Datha", "KLN1111111", "Carlene" }); + assertAlgorithm(new Caverphone(), false, "Darda Karleen Datha Carlene", + new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" }); + } + + + static void assertAlgorithm(Encoder encoder, boolean inject, String input, + String[] expected) throws Exception { + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader(input)); + PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject); + assertTokenStreamContents(filter, expected); + } +} diff --git a/solr/common-build.xml b/solr/common-build.xml index 62b79889a00..805bb769cec 100644 --- a/solr/common-build.xml +++ b/solr/common-build.xml @@ -147,6 +147,7 @@ + @@ -162,6 +163,7 @@ + @@ -181,6 +183,9 @@ + + + @@ -206,6 +211,7 @@ + diff --git a/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java index b2afbfd4937..07ab89ba6a6 100644 --- a/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java @@ -17,11 +17,10 @@ package org.apache.solr.analysis; -import org.apache.lucene.analysis.*; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter; import org.apache.lucene.analysis.util.CharArraySet; -import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -29,11 +28,7 @@ import java.util.Map; import 
java.util.StringTokenizer; /** - * A filter to apply normal capitalization rules to Tokens. It will make the first letter - * capital and the rest lower case. - *

- * This filter is particularly useful to build nice looking facet parameters. This filter - * is not appropriate if you intend to use a prefix query. + * Factory for {@link CapitalizationFilter}. *

* The factory takes parameters:
* "onlyFirstWord" - should each word be capitalized or all of the words?
@@ -52,7 +47,6 @@ import java.util.StringTokenizer; * @since solr 1.3 */ public class CapitalizationFilterFactory extends BaseTokenFilterFactory { - public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE; public static final String KEEP = "keep"; public static final String KEEP_IGNORE_CASE = "keepIgnoreCase"; public static final String OK_PREFIX = "okPrefix"; @@ -68,8 +62,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory { Collection okPrefix = Collections.emptyList(); // for Example: McK int minWordLength = 0; // don't modify capitalization for words shorter then this - int maxWordCount = DEFAULT_MAX_WORD_COUNT; - int maxTokenLength = DEFAULT_MAX_WORD_COUNT; + int maxWordCount = CapitalizationFilter.DEFAULT_MAX_WORD_COUNT; + int maxTokenLength = CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH; boolean onlyFirstWord = true; boolean forceFirstLetter = true; // make sure the first letter is capitol even if it is in the keep list @@ -128,116 +122,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory { } } - - public void processWord(char[] buffer, int offset, int length, int wordCount) { - if (length < 1) { - return; - } - if (onlyFirstWord && wordCount > 0) { - for (int i = 0; i < length; i++) { - buffer[offset + i] = Character.toLowerCase(buffer[offset + i]); - - } - return; - } - - if (keep != null && keep.contains(buffer, offset, length)) { - if (wordCount == 0 && forceFirstLetter) { - buffer[offset] = Character.toUpperCase(buffer[offset]); - } - return; - } - if (length < minWordLength) { - return; - } - for (char[] prefix : okPrefix) { - if (length >= prefix.length) { //don't bother checking if the buffer length is less than the prefix - boolean match = true; - for (int i = 0; i < prefix.length; i++) { - if (prefix[i] != buffer[offset + i]) { - match = false; - break; - } - } - if (match == true) { - return; - } - } - } - - // We know it has at least one character - /*char[] chars = w.toCharArray(); - 
StringBuilder word = new StringBuilder( w.length() ); - word.append( Character.toUpperCase( chars[0] ) );*/ - buffer[offset] = Character.toUpperCase(buffer[offset]); - - for (int i = 1; i < length; i++) { - buffer[offset + i] = Character.toLowerCase(buffer[offset + i]); - } - //return word.toString(); - } - public CapitalizationFilter create(TokenStream input) { - return new CapitalizationFilter(input, this); + return new CapitalizationFilter(input, onlyFirstWord, keep, + forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength); } } - - -/** - * This relies on the Factory so that the difficult stuff does not need to be - * re-initialized each time the filter runs. - *

- * This is package protected since it is not useful without the Factory - */ -final class CapitalizationFilter extends TokenFilter { - private final CapitalizationFilterFactory factory; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - - public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) { - super(in); - this.factory = factory; - } - - @Override - public boolean incrementToken() throws IOException { - if (!input.incrementToken()) return false; - - char[] termBuffer = termAtt.buffer(); - int termBufferLength = termAtt.length(); - char[] backup = null; - if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) { - //make a backup in case we exceed the word count - backup = new char[termBufferLength]; - System.arraycopy(termBuffer, 0, backup, 0, termBufferLength); - } - if (termBufferLength < factory.maxTokenLength) { - int wordCount = 0; - - int lastWordStart = 0; - for (int i = 0; i < termBufferLength; i++) { - char c = termBuffer[i]; - if (c <= ' ' || c == '.') { - int len = i - lastWordStart; - if (len > 0) { - factory.processWord(termBuffer, lastWordStart, len, wordCount++); - lastWordStart = i + 1; - i++; - } - } - } - - // process the last word - if (lastWordStart < termBufferLength) { - factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++); - } - - if (wordCount > factory.maxWordCount) { - termAtt.copyBuffer(backup, 0, termBufferLength); - } - } - - return true; - } - -} - diff --git a/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java b/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java index d7ec11ec8ea..bb72143c56c 100644 --- a/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java @@ -19,6 +19,7 @@ package org.apache.solr.analysis; import java.util.Map; import 
org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory { diff --git a/solr/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java b/solr/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java index 3872417d0ae..b53b9f35841 100644 --- a/solr/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java @@ -29,6 +29,7 @@ import org.apache.commons.codec.language.Metaphone; import org.apache.commons.codec.language.RefinedSoundex; import org.apache.commons.codec.language.Soundex; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.phonetic.PhoneticFilter; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.StrUtils; @@ -96,6 +97,6 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory } public PhoneticFilter create(TokenStream input) { - return new PhoneticFilter(input,encoder,name,inject); + return new PhoneticFilter(input,encoder,inject); } } diff --git a/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java b/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java index c61c827ca02..5c155d78317 100644 --- a/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java +++ b/solr/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java @@ -22,6 +22,7 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase { diff --git a/solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java 
b/solr/src/test/org/apache/solr/analysis/TestCapitalizationFilterFactory.java similarity index 64% rename from solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java rename to solr/src/test/org/apache/solr/analysis/TestCapitalizationFilterFactory.java index 2b1bd10e035..343754bd565 100644 --- a/solr/src/test/org/apache/solr/analysis/TestCapitalizationFilter.java +++ b/solr/src/test/org/apache/solr/analysis/TestCapitalizationFilterFactory.java @@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer; /** * */ -public class TestCapitalizationFilter extends BaseTokenTestCase { +public class TestCapitalizationFilterFactory extends BaseTokenTestCase { public void testCapitalization() throws Exception { @@ -40,74 +40,78 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); factory.init( args ); - char[] termBuffer; - termBuffer = "kiTTEN".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length)); - + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kiTTEN"))), + new String[] { "Kitten" }); + factory.forceFirstLetter = true; - termBuffer = "and".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("and"))), + new String[] { "And" }); - termBuffer = "AnD".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced, but it's not a keep word, either + //first is forced, but it's not a keep word, either + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new 
StringReader("AnD"))), + new String[] { "And" }); factory.forceFirstLetter = false; - termBuffer = "AnD".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "And", new String(termBuffer, 0, termBuffer.length)); //first is not forced, but it's not a keep word, either + + //first is not forced, but it's not a keep word, either + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))), + new String[] { "And" }); factory.forceFirstLetter = true; - termBuffer = "big".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "Big", new String(termBuffer, 0, termBuffer.length)); - termBuffer = "BIG".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length)); - Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan")); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("big"))), + new String[] { "Big" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("BIG"))), + new String[] { "BIG" }); + + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"))), + new String[] { "Hello there my name is ryan" }); + // now each token factory.onlyFirstWord = false; - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan")); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))), + new 
String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }); // now only the long words factory.minWordLength = 3; - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))), + new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }); // without prefix - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Mckinley" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))), + new String[] { "Mckinley" }); // Now try some prefixes factory = new CapitalizationFilterFactory(); args.put( "okPrefix", "McK" ); // all words factory.init( args ); - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "McKinley" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))), + new String[] { "McKinley" }); // now try some stuff with numbers factory.forceFirstLetter = false; factory.onlyFirstWord = false; - tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" )); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" }); + assertTokenStreamContents(factory.create( + new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third"))), + new String[] { "1st", "2nd", "Third" }); - factory.forceFirstLetter = true; - tokenizer = new KeywordTokenizer(new StringReader("the The the" )); - stream = 
factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "The The the" }); + factory.forceFirstLetter = true; + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("the The the"))), + new String[] { "The The the" }); } public void testKeepIgnoreCase() throws Exception { @@ -118,21 +122,20 @@ public class TestCapitalizationFilter extends BaseTokenTestCase { CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); factory.init( args ); - char[] termBuffer; - termBuffer = "kiTTEN".toCharArray(); factory.forceFirstLetter = true; - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "KiTTEN", new String(termBuffer, 0, termBuffer.length)); + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("kiTTEN"))), + new String[] { "KiTTEN" }); factory.forceFirstLetter = false; - termBuffer = "kiTTEN".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "kiTTEN", new String(termBuffer, 0, termBuffer.length)); + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("kiTTEN"))), + new String[] { "kiTTEN" }); factory.keep = null; - termBuffer = "kiTTEN".toCharArray(); - factory.processWord(termBuffer, 0, termBuffer.length, 0 ); - assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length)); + assertTokenStreamContents(factory.create( + new KeywordTokenizer(new StringReader("kiTTEN"))), + new String[] { "Kitten" }); } /** diff --git a/solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java b/solr/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java similarity index 98% rename from solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java rename to solr/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java index c2875beb38a..f9f8cca3f2a 100644 --- a/solr/src/test/org/apache/solr/analysis/TestPhoneticFilter.java +++ 
b/solr/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java @@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer; /** * @version $Id$ */ -public class TestPhoneticFilter extends BaseTokenTestCase { +public class TestPhoneticFilterFactory extends BaseTokenTestCase { public void testFactory() {