LUCENE-2413: consolidate commongrams into contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940761 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-05-04 07:50:41 +00:00
parent 435cc41758
commit 1d1331fa03
7 changed files with 48 additions and 16 deletions

View File

@@ -155,6 +155,12 @@ New features
of AttributeSource.cloneAttributes() instances and the new copyTo() method.
(Steven Rowe via Uwe Schindler)
* LUCENE-2413: Consolidated Solr analysis components into contrib/analyzers.
  New features from Solr now available to Lucene users include:
  - o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
    and phrases.
  (... in progress)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
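The o.a.l.analysis.commongrams entry above is the only prose description of the relocated module in this commit. A minimal index-side sketch, assuming a placeholder common-word list and Version.LUCENE_CURRENT as the matchVersion; the two-argument CommonGramsFilter constructor is the one exercised by the test changes further down:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CommonGramsSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical common-word list; real deployments would use a much larger one.
    String[] commonWords = { "the", "of" };
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader("the quick fox"));
    // CommonGramsFilter keeps the original unigrams and additionally emits a
    // bigram wherever a token is adjacent to a common word, joined with '_',
    // e.g. "the", "the_quick", "quick", "fox".
    ts = new CommonGramsFilter(ts, commonWords);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}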

View File

@@ -7,7 +7,7 @@
* See the License for the specific language governing permissions and limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.commongrams;
import java.io.IOException;
import java.util.Arrays;

View File

@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.commongrams;
import java.io.IOException;
@@ -22,7 +22,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
/**
* Wrap a CommonGramsFilter optimizing phrase queries by only returning single
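The javadoc above (cut off by the diff context) describes CommonGramsQueryFilter's query-time role. A hedged sketch of a query-side analyzer, modeled on the anonymous Analyzer used later in CommonGramsFilterTest; the word list and Version constant are placeholders:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.lucene.util.Version;

public class CommonGramsQuerySketch {
  // Hypothetical list; it should match whatever the index-side chain used.
  private static final String[] COMMON_WORDS = { "the", "of" };

  // CommonGramsQueryFilter keeps a common-word bigram wherever one was formed
  // and drops the single words it covers, so a query like "the dog" analyzes
  // to the single token "the_dog" (matching the expectations in this commit's test).
  static final Analyzer QUERY_ANALYZER = new Analyzer() {
    @Override
    public TokenStream tokenStream(String field, Reader in) {
      return new CommonGramsQueryFilter(new CommonGramsFilter(
          new WhitespaceTokenizer(Version.LUCENE_CURRENT, in), COMMON_WORDS));
    }
  };
}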

View File

@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Construct n-grams for frequently occurring terms and phrases.
</body>
</html>
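The package description above is a single sentence; as a rough illustration of the alternate constructor the package also exposes, here is a sketch mirroring the testCaseSensitive setup further down (word list, input, and Version constant are placeholders):

import java.io.StringReader;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.util.Version;

public class CommonGramsCaseSketch {
  public static void main(String[] args) throws Exception {
    String[] commonWords = { "the", "of" };
    // makeCommonSet builds the lookup set used by the filter; the trailing
    // boolean of the constructor is the ignoreCase flag exercised by
    // testCaseSensitive (false here, so "The" is not matched as "the").
    Set common = CommonGramsFilter.makeCommonSet(commonWords);
    WhitespaceTokenizer wt = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader("The dog the cat"));
    TokenFilter cgf = new CommonGramsFilter(wt, common, false);
  }
}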

View File

@@ -14,28 +14,29 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.commongrams;
import java.io.Reader;
import java.io.StringReader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* Tests CommonGramsQueryFilter
* Tests CommonGrams(Query)Filter
*/
public class CommonGramsFilterTest extends BaseTokenTestCase {
public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
"of" };
public void testReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
@@ -56,7 +57,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
public void testQueryReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
@@ -88,7 +89,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsQueryFilter(new CommonGramsFilter(
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords));
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords));
}
};
@@ -157,7 +158,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsFilter(
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords);
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords);
}
};
@@ -243,7 +244,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testCaseSensitive() throws Exception {
final String input = "How The s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
Set common = CommonGramsFilter.makeCommonSet(commonWords);
TokenFilter cgf = new CommonGramsFilter(wt, common, false);
assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
@@ -256,7 +257,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testLastWordisStopWord() throws Exception {
final String input = "dog the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "dog_the" });
@@ -267,7 +268,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testFirstWordisStopWord() throws Exception {
final String input = "the dog";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_dog" });
@@ -278,7 +279,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testOneWordQueryStopWord() throws Exception {
final String input = "the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the" });
@@ -289,7 +290,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testOneWordQuery() throws Exception {
final String input = "monster";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "monster" });
@@ -300,7 +301,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void TestFirstAndLastStopWord() throws Exception {
final String input = "the of";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_of" });

View File

@@ -22,6 +22,7 @@ import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;

View File

@@ -23,6 +23,8 @@ import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;