LUCENE-2413: consolidate commongrams into contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940761 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-04 07:50:41 +00:00
parent 435cc41758
commit 1d1331fa03
7 changed files with 48 additions and 16 deletions

View File

@@ -155,6 +155,12 @@ New features
   of AttributeSource.cloneAttributes() instances and the new copyTo() method.
   (Steven Rowe via Uwe Schindler)
+* LUCENE-2413: Consolidated Solr analysis components into contrib/analyzers.
+  New features from Solr now available to Lucene users include:
+  - o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
+    and phrases.
+  (... in progress)
 Build
 * LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@@ -7,7 +7,7 @@
  * See the License for the specific language governing permissions and limitations under the License.
  */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.commongrams;
 import java.io.IOException;
 import java.util.Arrays;

View File

@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.commongrams;
 import java.io.IOException;
@@ -22,7 +22,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
+import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
 /**
  * Wrap a CommonGramsFilter optimizing phrase queries by only returning single

View File

@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Construct n-grams for frequently occurring terms and phrases.
+</body>
+</html>

View File

@@ -14,28 +14,29 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.commongrams;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
- * Tests CommonGramsQueryFilter
+ * Tests CommonGrams(Query)Filter
  */
-public class CommonGramsFilterTest extends BaseTokenTestCase {
+public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
 private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
 "of" };
 public void testReset() throws Exception {
 final String input = "How the s a brown s cow d like A B thing?";
-WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
+WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
 CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
 CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
@@ -56,7 +57,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
 public void testQueryReset() throws Exception {
 final String input = "How the s a brown s cow d like A B thing?";
-WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
+WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
 CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
 CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
@@ -88,7 +89,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
 @Override
 public TokenStream tokenStream(String field, Reader in) {
 return new CommonGramsQueryFilter(new CommonGramsFilter(
-new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords));
+new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords));
 }
 };
@@ -157,7 +158,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
 @Override
 public TokenStream tokenStream(String field, Reader in) {
 return new CommonGramsFilter(
-new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords);
+new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords);
 }
 };
@@ -243,7 +244,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
  */
 public void testCaseSensitive() throws Exception {
 final String input = "How The s a brown s cow d like A B thing?";
-WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
+WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
 Set common = CommonGramsFilter.makeCommonSet(commonWords);
 TokenFilter cgf = new CommonGramsFilter(wt, common, false);
 assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
@@ -256,7 +257,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
  */
 public void testLastWordisStopWord() throws Exception {
 final String input = "dog the";
-WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
+WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
 CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
 TokenFilter nsf = new CommonGramsQueryFilter(cgf);
 assertTokenStreamContents(nsf, new String[] { "dog_the" });
@@ -267,7 +268,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
  */
 public void testFirstWordisStopWord() throws Exception {
 final String input = "the dog";
-WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
+WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
 CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
 TokenFilter nsf = new CommonGramsQueryFilter(cgf);
 assertTokenStreamContents(nsf, new String[] { "the_dog" });
@@ -278,7 +279,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
  */
 public void testOneWordQueryStopWord() throws Exception {
 final String input = "the";
-WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
+WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
 CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
 TokenFilter nsf = new CommonGramsQueryFilter(cgf);
 assertTokenStreamContents(nsf, new String[] { "the" });
@@ -289,7 +290,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
  */
 public void testOneWordQuery() throws Exception {
 final String input = "monster";
-WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
+WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
 CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
 TokenFilter nsf = new CommonGramsQueryFilter(cgf);
 assertTokenStreamContents(nsf, new String[] { "monster" });
@@ -300,7 +301,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
  */
 public void TestFirstAndLastStopWord() throws Exception {
 final String input = "the of";
-WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
+WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
 CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
 TokenFilter nsf = new CommonGramsQueryFilter(cgf);
 assertTokenStreamContents(nsf, new String[] { "the_of" });

View File

@@ -22,6 +22,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.util.plugin.ResourceLoaderAware;

View File

@@ -23,6 +23,8 @@ import java.util.Set;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.util.plugin.ResourceLoaderAware;