mirror of https://github.com/apache/lucene.git
LUCENE-2413: consolidate commongrams into contrib/analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940761 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
435cc41758
commit
1d1331fa03
|
@ -155,6 +155,12 @@ New features
|
||||||
of AttributeSource.cloneAttributes() instances and the new copyTo() method.
|
of AttributeSource.cloneAttributes() instances and the new copyTo() method.
|
||||||
(Steven Rowe via Uwe Schindler)
|
(Steven Rowe via Uwe Schindler)
|
||||||
|
|
||||||
|
* LUCENE-2413: Consolidated Solr analysis components into contrib/analyzers.
|
||||||
|
New features from Solr now available to Lucene users include:
|
||||||
|
- o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
|
||||||
|
and phrases.
|
||||||
|
(... in progress)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
* See the License for the specific language governing permissions and limitations under the License.
|
* See the License for the specific language governing permissions and limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
package org.apache.lucene.analysis.commongrams;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
|
@ -14,7 +14,7 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.analysis;
|
package org.apache.lucene.analysis.commongrams;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
|
||||||
import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
|
import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrap a CommonGramsFilter optimizing phrase queries by only returning single
|
* Wrap a CommonGramsFilter optimizing phrase queries by only returning single
|
|
@ -0,0 +1,22 @@
|
||||||
|
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Construct n-grams for frequently occurring terms and phrases.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -14,28 +14,29 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.analysis;
|
package org.apache.lucene.analysis.commongrams;
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests CommonGramsQueryFilter
|
* Tests CommonGrams(Query)Filter
|
||||||
*/
|
*/
|
||||||
public class CommonGramsFilterTest extends BaseTokenTestCase {
|
public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
|
||||||
private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
|
private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
|
||||||
"of" };
|
"of" };
|
||||||
|
|
||||||
public void testReset() throws Exception {
|
public void testReset() throws Exception {
|
||||||
final String input = "How the s a brown s cow d like A B thing?";
|
final String input = "How the s a brown s cow d like A B thing?";
|
||||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||||
|
|
||||||
CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
|
CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
|
||||||
|
@ -56,7 +57,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||||
|
|
||||||
public void testQueryReset() throws Exception {
|
public void testQueryReset() throws Exception {
|
||||||
final String input = "How the s a brown s cow d like A B thing?";
|
final String input = "How the s a brown s cow d like A B thing?";
|
||||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||||
CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
|
CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||||
|
|
||||||
|
@ -88,7 +89,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||||
@Override
|
@Override
|
||||||
public TokenStream tokenStream(String field, Reader in) {
|
public TokenStream tokenStream(String field, Reader in) {
|
||||||
return new CommonGramsQueryFilter(new CommonGramsFilter(
|
return new CommonGramsQueryFilter(new CommonGramsFilter(
|
||||||
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords));
|
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -157,7 +158,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||||
@Override
|
@Override
|
||||||
public TokenStream tokenStream(String field, Reader in) {
|
public TokenStream tokenStream(String field, Reader in) {
|
||||||
return new CommonGramsFilter(
|
return new CommonGramsFilter(
|
||||||
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords);
|
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -243,7 +244,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void testCaseSensitive() throws Exception {
|
public void testCaseSensitive() throws Exception {
|
||||||
final String input = "How The s a brown s cow d like A B thing?";
|
final String input = "How The s a brown s cow d like A B thing?";
|
||||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||||
Set common = CommonGramsFilter.makeCommonSet(commonWords);
|
Set common = CommonGramsFilter.makeCommonSet(commonWords);
|
||||||
TokenFilter cgf = new CommonGramsFilter(wt, common, false);
|
TokenFilter cgf = new CommonGramsFilter(wt, common, false);
|
||||||
assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
|
assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
|
||||||
|
@ -256,7 +257,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void testLastWordisStopWord() throws Exception {
|
public void testLastWordisStopWord() throws Exception {
|
||||||
final String input = "dog the";
|
final String input = "dog the";
|
||||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||||
assertTokenStreamContents(nsf, new String[] { "dog_the" });
|
assertTokenStreamContents(nsf, new String[] { "dog_the" });
|
||||||
|
@ -267,7 +268,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void testFirstWordisStopWord() throws Exception {
|
public void testFirstWordisStopWord() throws Exception {
|
||||||
final String input = "the dog";
|
final String input = "the dog";
|
||||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||||
assertTokenStreamContents(nsf, new String[] { "the_dog" });
|
assertTokenStreamContents(nsf, new String[] { "the_dog" });
|
||||||
|
@ -278,7 +279,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void testOneWordQueryStopWord() throws Exception {
|
public void testOneWordQueryStopWord() throws Exception {
|
||||||
final String input = "the";
|
final String input = "the";
|
||||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||||
assertTokenStreamContents(nsf, new String[] { "the" });
|
assertTokenStreamContents(nsf, new String[] { "the" });
|
||||||
|
@ -289,7 +290,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void testOneWordQuery() throws Exception {
|
public void testOneWordQuery() throws Exception {
|
||||||
final String input = "monster";
|
final String input = "monster";
|
||||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||||
assertTokenStreamContents(nsf, new String[] { "monster" });
|
assertTokenStreamContents(nsf, new String[] { "monster" });
|
||||||
|
@ -300,7 +301,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void TestFirstAndLastStopWord() throws Exception {
|
public void TestFirstAndLastStopWord() throws Exception {
|
||||||
final String input = "the of";
|
final String input = "the of";
|
||||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||||
assertTokenStreamContents(nsf, new String[] { "the_of" });
|
assertTokenStreamContents(nsf, new String[] { "the_of" });
|
|
@ -22,6 +22,7 @@ import java.util.Set;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.analysis.StopAnalyzer;
|
import org.apache.lucene.analysis.StopAnalyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||||
import org.apache.solr.common.ResourceLoader;
|
import org.apache.solr.common.ResourceLoader;
|
||||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,8 @@ import java.util.Set;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.analysis.StopAnalyzer;
|
import org.apache.lucene.analysis.StopAnalyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||||
|
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
|
||||||
import org.apache.solr.common.ResourceLoader;
|
import org.apache.solr.common.ResourceLoader;
|
||||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue