mirror of https://github.com/apache/lucene.git
LUCENE-2413: consolidate commongrams into contrib/analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940761 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
435cc41758
commit
1d1331fa03
|
@ -155,6 +155,12 @@ New features
|
|||
of AttributeSource.cloneAttributes() instances and the new copyTo() method.
|
||||
(Steven Rowe via Uwe Schindler)
|
||||
|
||||
* LUCENE-2413: Consolidated Solr analysis components into contrib/analyzers.
|
||||
New features from Solr now available to Lucene users include:
|
||||
- o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
|
||||
and phrases.
|
||||
(... in progress)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* See the License for the specific language governing permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.commongrams;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.commongrams;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -22,7 +22,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
|||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
|
||||
import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
|
||||
|
||||
/**
|
||||
* Wrap a CommonGramsFilter optimizing phrase queries by only returning single
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Construct n-grams for frequently occurring terms and phrases.
|
||||
</body>
|
||||
</html>
|
|
@ -14,28 +14,29 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.commongrams;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
* Tests CommonGramsQueryFilter
|
||||
* Tests CommonGrams(Query)Filter
|
||||
*/
|
||||
public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||
public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
|
||||
private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
|
||||
"of" };
|
||||
|
||||
public void testReset() throws Exception {
|
||||
final String input = "How the s a brown s cow d like A B thing?";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
|
||||
CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
|
||||
|
@ -56,7 +57,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
|||
|
||||
public void testQueryReset() throws Exception {
|
||||
final String input = "How the s a brown s cow d like A B thing?";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
|
||||
|
@ -88,7 +89,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
|||
@Override
|
||||
public TokenStream tokenStream(String field, Reader in) {
|
||||
return new CommonGramsQueryFilter(new CommonGramsFilter(
|
||||
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords));
|
||||
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords));
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -157,7 +158,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
|||
@Override
|
||||
public TokenStream tokenStream(String field, Reader in) {
|
||||
return new CommonGramsFilter(
|
||||
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords);
|
||||
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -243,7 +244,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
|||
*/
|
||||
public void testCaseSensitive() throws Exception {
|
||||
final String input = "How The s a brown s cow d like A B thing?";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
Set common = CommonGramsFilter.makeCommonSet(commonWords);
|
||||
TokenFilter cgf = new CommonGramsFilter(wt, common, false);
|
||||
assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
|
||||
|
@ -256,7 +257,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
|||
*/
|
||||
public void testLastWordisStopWord() throws Exception {
|
||||
final String input = "dog the";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "dog_the" });
|
||||
|
@ -267,7 +268,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
|||
*/
|
||||
public void testFirstWordisStopWord() throws Exception {
|
||||
final String input = "the dog";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "the_dog" });
|
||||
|
@ -278,7 +279,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
|||
*/
|
||||
public void testOneWordQueryStopWord() throws Exception {
|
||||
final String input = "the";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "the" });
|
||||
|
@ -289,7 +290,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
|||
*/
|
||||
public void testOneWordQuery() throws Exception {
|
||||
final String input = "monster";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "monster" });
|
||||
|
@ -300,7 +301,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
|
|||
*/
|
||||
public void TestFirstAndLastStopWord() throws Exception {
|
||||
final String input = "the of";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "the_of" });
|
|
@ -22,6 +22,7 @@ import java.util.Set;
|
|||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
|
|
|
@ -23,6 +23,8 @@ import java.util.Set;
|
|||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
|
|
Loading…
Reference in New Issue