mirror of https://github.com/apache/lucene.git
LUCENE-6177: Add CustomAnalyzer that allows to configure analyzers like you do in Solr's index schema. This class has a builder API to configure Tokenizers, TokenFilters, and CharFilters based on their SPI names and parameters as documented by the corresponding factories.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1651681 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
71f698c543
commit
7dff28a7bb
|
@ -143,6 +143,12 @@ New Features
|
|||
|
||||
* LUCENE-6166: Deletions (alone) can now trigger new merges. (Mike McCandless)
|
||||
|
||||
* LUCENE-6177: Add CustomAnalyzer that allows to configure analyzers
|
||||
like you do in Solr's index schema. This class has a builder API to configure
|
||||
Tokenizers, TokenFilters, and CharFilters based on their SPI names
|
||||
and parameters as documented by the corresponding factories.
|
||||
(Uwe Schindler)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to
|
||||
|
|
|
@ -0,0 +1,323 @@
|
|||
package org.apache.lucene.analysis.custom;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.SetOnce;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* A general-purpose Analyzer that can be created with a builder-style API.
|
||||
* Under the hood it uses the factory classes {@link TokenizerFactory},
|
||||
* {@link TokenFilterFactory}, and {@link CharFilterFactory}.
|
||||
* <p>You can create an instance of this Analyzer using the builder:
|
||||
* <pre class="prettyprint">
|
||||
* Analyzer ana = CustomAnalyzer.builder(Path.get("/path/to/config/dir"))
|
||||
* .withTokenizer("standard")
|
||||
* .addTokenFilter("standard")
|
||||
* .addTokenFilter("lowercase")
|
||||
* .addTokenFilter("stop", "ignoreCase", "false", "words", "stopwords.txt", "format", "wordset")
|
||||
* .build();
|
||||
* </pre>
|
||||
* The parameters passed to components are also used by Apache Solr and are documented
|
||||
* on their corresponding factory classes. Refer to documentation of subclasses
|
||||
* of {@link TokenizerFactory}, {@link TokenFilterFactory}, and {@link CharFilterFactory}.
|
||||
* <p>The list of names to be used for components can be looked up through:
|
||||
* {@link TokenizerFactory#availableTokenizers()}, {@link TokenFilterFactory#availableTokenFilters()},
|
||||
* and {@link CharFilterFactory#availableCharFilters()}.
|
||||
*/
|
||||
public final class CustomAnalyzer extends Analyzer {
|
||||
|
||||
/** Returns a builder for custom analyzers that loads all resources from classpath.
|
||||
* All path names given must be absolute with package prefixes. */
|
||||
public static Builder builder() {
|
||||
return builder(new ClasspathResourceLoader());
|
||||
}
|
||||
|
||||
/** Returns a builder for custom analyzers that loads all resources from the given
|
||||
* file system base directory. Place, e.g., stop word files there.
|
||||
* Files that are not in the given directory are loaded from classpath. */
|
||||
public static Builder builder(Path configDir) {
|
||||
return builder(new FilesystemResourceLoader(configDir));
|
||||
}
|
||||
|
||||
/** Returns a builder for custom analyzers that loads all resources using the given {@link ResourceLoader}. */
|
||||
public static Builder builder(ResourceLoader loader) {
|
||||
return new Builder(loader);
|
||||
}
|
||||
|
||||
private final CharFilterFactory[] charFilters;
|
||||
private final TokenizerFactory tokenizer;
|
||||
private final TokenFilterFactory[] tokenFilters;
|
||||
private final Integer posIncGap, offsetGap;
|
||||
|
||||
CustomAnalyzer(Version defaultMatchVersion, CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] tokenFilters, Integer posIncGap, Integer offsetGap) {
|
||||
this.charFilters = charFilters;
|
||||
this.tokenizer = tokenizer;
|
||||
this.tokenFilters = tokenFilters;
|
||||
this.posIncGap = posIncGap;
|
||||
this.offsetGap = offsetGap;
|
||||
if (defaultMatchVersion != null) {
|
||||
setVersion(defaultMatchVersion);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
for (final CharFilterFactory charFilter : charFilters) {
|
||||
reader = charFilter.create(reader);
|
||||
}
|
||||
return reader;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
final Tokenizer tk = tokenizer.create();
|
||||
TokenStream ts = tk;
|
||||
for (final TokenFilterFactory filter : tokenFilters) {
|
||||
ts = filter.create(ts);
|
||||
}
|
||||
return new TokenStreamComponents(tk, ts);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPositionIncrementGap(String fieldName) {
|
||||
// use default from Analyzer base class if null
|
||||
return (posIncGap == null) ? super.getPositionIncrementGap(fieldName) : posIncGap.intValue();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getOffsetGap(String fieldName) {
|
||||
// use default from Analyzer base class if null
|
||||
return (offsetGap == null) ? super.getOffsetGap(fieldName) : offsetGap.intValue();
|
||||
}
|
||||
|
||||
/** Returns the list of char filters that are used in this analyzer. */
|
||||
public List<CharFilterFactory> getCharFilterFactories() {
|
||||
return Collections.unmodifiableList(Arrays.asList(charFilters));
|
||||
}
|
||||
|
||||
/** Returns the tokenizer that is used in this analyzer. */
|
||||
public TokenizerFactory getTokenizerFactory() {
|
||||
return tokenizer;
|
||||
}
|
||||
|
||||
/** Returns the list of token filters that are used in this analyzer. */
|
||||
public List<TokenFilterFactory> getTokenFilterFactories() {
|
||||
return Collections.unmodifiableList(Arrays.asList(tokenFilters));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder sb = new StringBuilder(getClass().getSimpleName()).append('(');
|
||||
for (final CharFilterFactory filter : charFilters) {
|
||||
sb.append(filter).append(',');
|
||||
}
|
||||
sb.append(tokenizer);
|
||||
for (final TokenFilterFactory filter : tokenFilters) {
|
||||
sb.append(',').append(filter);
|
||||
}
|
||||
return sb.append(')').toString();
|
||||
}
|
||||
|
||||
/** Builder for {@link CustomAnalyzer}.
|
||||
* @see CustomAnalyzer#builder()
|
||||
* @see CustomAnalyzer#builder(Path)
|
||||
* @see CustomAnalyzer#builder(ResourceLoader)
|
||||
*/
|
||||
public static final class Builder {
|
||||
private final ResourceLoader loader;
|
||||
private final SetOnce<Version> defaultMatchVersion = new SetOnce<>();
|
||||
private final List<CharFilterFactory> charFilters = new ArrayList<>();
|
||||
private final SetOnce<TokenizerFactory> tokenizer = new SetOnce<>();
|
||||
private final List<TokenFilterFactory> tokenFilters = new ArrayList<>();
|
||||
private final SetOnce<Integer> posIncGap = new SetOnce<>();
|
||||
private final SetOnce<Integer> offsetGap = new SetOnce<>();
|
||||
|
||||
private boolean componentsAdded = false;
|
||||
|
||||
Builder(ResourceLoader loader) {
|
||||
this.loader = loader;
|
||||
}
|
||||
|
||||
/** This match version is passed as default to all tokenizers or filters. It is used unless you
|
||||
* pass the parameter {code luceneMatchVersion} explicitly. It defaults to undefined, so the
|
||||
* underlying factory will (in most cases) use {@link Version#LATEST}. */
|
||||
public Builder withDefaultMatchVersion(Version version) {
|
||||
Objects.requireNonNull(version, "version may not be null");
|
||||
if (componentsAdded) {
|
||||
throw new IllegalStateException("You may only set the default match version before adding tokenizers, "+
|
||||
"token filters, or char filters.");
|
||||
}
|
||||
this.defaultMatchVersion.set(version);
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the position increment gap of the analyzer.
|
||||
* The default is defined in the analyzer base class.
|
||||
* @see Analyzer#getPositionIncrementGap(String)
|
||||
*/
|
||||
public Builder withPositionIncrementGap(int posIncGap) {
|
||||
if (posIncGap < 0) {
|
||||
throw new IllegalArgumentException("posIncGap must be >= 0");
|
||||
}
|
||||
this.posIncGap.set(posIncGap);
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Sets the offset gap of the analyzer. The default is defined
|
||||
* in the analyzer base class.
|
||||
* @see Analyzer#getOffsetGap(String)
|
||||
*/
|
||||
public Builder withOffsetGap(int offsetGap) {
|
||||
if (offsetGap < 0) {
|
||||
throw new IllegalArgumentException("offsetGap must be >= 0");
|
||||
}
|
||||
this.offsetGap.set(offsetGap);
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Uses the given tokenizer.
|
||||
* @param name is used to look up the factory with {@link TokenizerFactory#forName(String, Map)}.
|
||||
* The list of possible names can be looked up with {@link TokenizerFactory#availableTokenizers()}.
|
||||
* @param params a list of factory string params as key/value pairs.
|
||||
* The number of parameters must be an even number, as they are pairs.
|
||||
*/
|
||||
public Builder withTokenizer(String name, String... params) throws IOException {
|
||||
return withTokenizer(name, paramsToMap(params));
|
||||
}
|
||||
|
||||
/** Uses the given tokenizer.
|
||||
* @param name is used to look up the factory with {@link TokenizerFactory#forName(String, Map)}.
|
||||
* The list of possible names can be looked up with {@link TokenizerFactory#availableTokenizers()}.
|
||||
* @param params the map of parameters to be passed to factory. The map must be modifiable.
|
||||
*/
|
||||
public Builder withTokenizer(String name, Map<String,String> params) throws IOException {
|
||||
Objects.requireNonNull(name, "Tokenizer name may not be null");
|
||||
tokenizer.set(applyResourceLoader(TokenizerFactory.forName(name, applyDefaultParams(params))));
|
||||
componentsAdded = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Adds the given token filter.
|
||||
* @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}.
|
||||
* The list of possible names can be looked up with {@link TokenFilterFactory#availableTokenFilters()}.
|
||||
* @param params a list of factory string params as key/value pairs.
|
||||
* The number of parameters must be an even number, as they are pairs.
|
||||
*/
|
||||
public Builder addTokenFilter(String name, String... params) throws IOException {
|
||||
return addTokenFilter(name, paramsToMap(params));
|
||||
}
|
||||
|
||||
/** Adds the given token filter.
|
||||
* @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}.
|
||||
* The list of possible names can be looked up with {@link TokenFilterFactory#availableTokenFilters()}.
|
||||
* @param params the map of parameters to be passed to factory. The map must be modifiable.
|
||||
*/
|
||||
public Builder addTokenFilter(String name, Map<String,String> params) throws IOException {
|
||||
Objects.requireNonNull(name, "TokenFilter name may not be null");
|
||||
tokenFilters.add(applyResourceLoader(TokenFilterFactory.forName(name, applyDefaultParams(params))));
|
||||
componentsAdded = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Adds the given char filter.
|
||||
* @param name is used to look up the factory with {@link CharFilterFactory#forName(String, Map)}.
|
||||
* The list of possible names can be looked up with {@link CharFilterFactory#availableCharFilters()}.
|
||||
* @param params a list of factory string params as key/value pairs.
|
||||
* The number of parameters must be an even number, as they are pairs.
|
||||
*/
|
||||
public Builder addCharFilter(String name, String... params) throws IOException {
|
||||
return addCharFilter(name, paramsToMap(params));
|
||||
}
|
||||
|
||||
/** Adds the given char filter.
|
||||
* @param name is used to look up the factory with {@link CharFilterFactory#forName(String, Map)}.
|
||||
* The list of possible names can be looked up with {@link CharFilterFactory#availableCharFilters()}.
|
||||
* @param params the map of parameters to be passed to factory. The map must be modifiable.
|
||||
*/
|
||||
public Builder addCharFilter(String name, Map<String,String> params) throws IOException {
|
||||
Objects.requireNonNull(name, "CharFilter name may not be null");
|
||||
charFilters.add(applyResourceLoader(CharFilterFactory.forName(name, applyDefaultParams(params))));
|
||||
componentsAdded = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Builds the analyzer. */
|
||||
public CustomAnalyzer build() {
|
||||
if (tokenizer.get() == null) {
|
||||
throw new IllegalStateException("You have to set at least a tokenizer.");
|
||||
}
|
||||
return new CustomAnalyzer(
|
||||
defaultMatchVersion.get(),
|
||||
charFilters.toArray(new CharFilterFactory[charFilters.size()]),
|
||||
tokenizer.get(),
|
||||
tokenFilters.toArray(new TokenFilterFactory[tokenFilters.size()]),
|
||||
posIncGap.get(),
|
||||
offsetGap.get()
|
||||
);
|
||||
}
|
||||
|
||||
private Map<String,String> applyDefaultParams(Map<String,String> map) {
|
||||
if (defaultMatchVersion.get() != null && !map.containsKey(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM)) {
|
||||
map.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, defaultMatchVersion.get().toString());
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
private Map<String, String> paramsToMap(String... params) {
|
||||
if (params.length % 2 != 0) {
|
||||
throw new IllegalArgumentException("Key-value pairs expected, so the number of params must be even.");
|
||||
}
|
||||
final Map<String, String> map = new HashMap<>();
|
||||
for (int i = 0; i < params.length; i += 2) {
|
||||
Objects.requireNonNull(params[i], "Key of param may not be null.");
|
||||
map.put(params[i], params[i + 1]);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
private <T> T applyResourceLoader(T factory) throws IOException {
|
||||
if (factory instanceof ResourceLoaderAware) {
|
||||
((ResourceLoaderAware) factory).inform(loader);
|
||||
}
|
||||
return factory;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
A general-purpose Analyzer that can be created with a builder-style API.
|
||||
</body>
|
||||
</html>
|
|
@ -27,7 +27,8 @@
|
|||
This module contains concrete components ({@link org.apache.lucene.analysis.CharFilter}s,
|
||||
{@link org.apache.lucene.analysis.Tokenizer}s, and ({@link org.apache.lucene.analysis.TokenFilter}s) for
|
||||
analyzing different types of content. It also provides a number of {@link org.apache.lucene.analysis.Analyzer}s
|
||||
for different languages that you can use to get started quickly.
|
||||
for different languages that you can use to get started quickly. To define fully custom Analyzers
|
||||
(like in the index schema of Apache Solr), this module provides {@link org.apache.lucene.analysis.custom.CustomAnalyzer}.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,289 @@
|
|||
package org.apache.lucene.analysis.custom;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
|
||||
import org.apache.lucene.analysis.core.StopFilterFactory;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
|
||||
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.util.SetOnce.AlreadySetException;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
// Test some examples (TODO: we only check behavior, we may need something like TestRandomChains...)
|
||||
|
||||
public void testWhitespaceWithFolding() throws Exception {
|
||||
CustomAnalyzer a = CustomAnalyzer.builder()
|
||||
.withTokenizer("whitespace")
|
||||
.addTokenFilter("asciifolding", "preserveOriginal", "true")
|
||||
.addTokenFilter("lowercase")
|
||||
.build();
|
||||
|
||||
assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
|
||||
assertEquals(Collections.emptyList(), a.getCharFilterFactories());
|
||||
List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
|
||||
assertEquals(2, tokenFilters.size());
|
||||
assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
|
||||
assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
|
||||
assertEquals(0, a.getPositionIncrementGap("dummy"));
|
||||
assertEquals(1, a.getOffsetGap("dummy"));
|
||||
assertSame(Version.LATEST, a.getVersion());
|
||||
|
||||
assertAnalyzesTo(a, "foo bar FOO BAR",
|
||||
new String[] { "foo", "bar", "foo", "bar" },
|
||||
new int[] { 1, 1, 1, 1});
|
||||
assertAnalyzesTo(a, "föó bär FÖÖ BAR",
|
||||
new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1});
|
||||
}
|
||||
|
||||
public void testHtmlStripClassicFolding() throws Exception {
|
||||
CustomAnalyzer a = CustomAnalyzer.builder()
|
||||
.withDefaultMatchVersion(Version.LUCENE_5_0_0)
|
||||
.addCharFilter("htmlstrip")
|
||||
.withTokenizer("classic")
|
||||
.addTokenFilter("asciifolding", "preserveOriginal", "true")
|
||||
.addTokenFilter("lowercase")
|
||||
.withPositionIncrementGap(100)
|
||||
.withOffsetGap(1000)
|
||||
.build();
|
||||
|
||||
assertSame(ClassicTokenizerFactory.class, a.getTokenizerFactory().getClass());
|
||||
List<CharFilterFactory> charFilters = a.getCharFilterFactories();
|
||||
assertEquals(1, charFilters.size());
|
||||
assertEquals(HTMLStripCharFilterFactory.class, charFilters.get(0).getClass());
|
||||
List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
|
||||
assertEquals(2, tokenFilters.size());
|
||||
assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
|
||||
assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
|
||||
assertEquals(100, a.getPositionIncrementGap("dummy"));
|
||||
assertEquals(1000, a.getOffsetGap("dummy"));
|
||||
assertSame(Version.LUCENE_5_0_0, a.getVersion());
|
||||
|
||||
assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
|
||||
new String[] { "foo", "bar", "foo", "bar" },
|
||||
new int[] { 1, 1, 1, 1});
|
||||
assertAnalyzesTo(a, "<p><b>föó</b> bär FÖÖ BAR</p>",
|
||||
new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1});
|
||||
}
|
||||
|
||||
public void testStopWordsFromClasspath() throws Exception {
|
||||
CustomAnalyzer a = CustomAnalyzer.builder()
|
||||
.withTokenizer("whitespace")
|
||||
.addTokenFilter("stop",
|
||||
"ignoreCase", "true",
|
||||
"words", "org/apache/lucene/analysis/custom/teststop.txt",
|
||||
"format", "wordset")
|
||||
.build();
|
||||
|
||||
assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
|
||||
assertEquals(Collections.emptyList(), a.getCharFilterFactories());
|
||||
List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
|
||||
assertEquals(1, tokenFilters.size());
|
||||
assertSame(StopFilterFactory.class, tokenFilters.get(0).getClass());
|
||||
assertEquals(0, a.getPositionIncrementGap("dummy"));
|
||||
assertEquals(1, a.getOffsetGap("dummy"));
|
||||
assertSame(Version.LATEST, a.getVersion());
|
||||
|
||||
assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
|
||||
}
|
||||
|
||||
public void testStopWordsFromClasspathWithMap() throws Exception {
|
||||
Map<String,String> stopConfig1 = new HashMap<>();
|
||||
stopConfig1.put("ignoreCase", "true");
|
||||
stopConfig1.put("words", "org/apache/lucene/analysis/custom/teststop.txt");
|
||||
stopConfig1.put("format", "wordset");
|
||||
|
||||
Map<String,String> stopConfig2 = Collections.unmodifiableMap(new HashMap<>(stopConfig1));
|
||||
|
||||
CustomAnalyzer a = CustomAnalyzer.builder()
|
||||
.withTokenizer("whitespace")
|
||||
.addTokenFilter("stop", stopConfig1)
|
||||
.build();
|
||||
assertTrue(stopConfig1.isEmpty());
|
||||
assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
|
||||
|
||||
// try with unmodifiableMap, should fail
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.withTokenizer("whitespace")
|
||||
.addTokenFilter("stop", stopConfig2)
|
||||
.build();
|
||||
fail();
|
||||
} catch (IllegalArgumentException | UnsupportedOperationException e) {
|
||||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
public void testStopWordsFromFile() throws Exception {
|
||||
CustomAnalyzer a = CustomAnalyzer.builder(this.getDataPath(""))
|
||||
.withTokenizer("whitespace")
|
||||
.addTokenFilter("stop",
|
||||
"ignoreCase", "true",
|
||||
"words", "teststop.txt",
|
||||
"format", "wordset")
|
||||
.build();
|
||||
assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
|
||||
}
|
||||
|
||||
public void testStopWordsFromFileAbsolute() throws Exception {
|
||||
CustomAnalyzer a = CustomAnalyzer.builder(Paths.get("."))
|
||||
.withTokenizer("whitespace")
|
||||
.addTokenFilter("stop",
|
||||
"ignoreCase", "true",
|
||||
"words", this.getDataPath("teststop.txt").toString(),
|
||||
"format", "wordset")
|
||||
.build();
|
||||
assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
|
||||
}
|
||||
|
||||
// Now test misconfigurations:
|
||||
|
||||
public void testIncorrectOrder() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.addCharFilter("htmlstrip")
|
||||
.withDefaultMatchVersion(Version.LATEST)
|
||||
.withTokenizer("whitespace")
|
||||
.build();
|
||||
fail();
|
||||
} catch (IllegalStateException e) {
|
||||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
public void testMissingSPI() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.withTokenizer("foobar_nonexistent")
|
||||
.build();
|
||||
fail();
|
||||
} catch (IllegalArgumentException e) {
|
||||
assertTrue(e.getMessage().contains("SPI"));
|
||||
assertTrue(e.getMessage().contains("does not exist"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testSetTokenizerTwice() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.withTokenizer("whitespace")
|
||||
.withTokenizer("standard")
|
||||
.build();
|
||||
fail();
|
||||
} catch (AlreadySetException e) {
|
||||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
public void testSetMatchVersionTwice() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.withDefaultMatchVersion(Version.LATEST)
|
||||
.withDefaultMatchVersion(Version.LATEST)
|
||||
.withTokenizer("standard")
|
||||
.build();
|
||||
fail();
|
||||
} catch (AlreadySetException e) {
|
||||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
public void testSetPosIncTwice() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.withPositionIncrementGap(2)
|
||||
.withPositionIncrementGap(3)
|
||||
.withTokenizer("standard")
|
||||
.build();
|
||||
fail();
|
||||
} catch (AlreadySetException e) {
|
||||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
public void testSetOfsGapTwice() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.withOffsetGap(2)
|
||||
.withOffsetGap(3)
|
||||
.withTokenizer("standard")
|
||||
.build();
|
||||
fail();
|
||||
} catch (AlreadySetException e) {
|
||||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
public void testNoTokenizer() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder().build();
|
||||
fail();
|
||||
} catch (IllegalStateException e) {
|
||||
assertTrue(e.getMessage().equals("You have to set at least a tokenizer."));
|
||||
}
|
||||
}
|
||||
|
||||
public void testNullTokenizer() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.withTokenizer(null)
|
||||
.build();
|
||||
fail();
|
||||
} catch (NullPointerException e) {
|
||||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
public void testNullParamKey() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.withTokenizer("whitespace", null, "foo")
|
||||
.build();
|
||||
fail();
|
||||
} catch (NullPointerException e) {
|
||||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
public void testNullMatchVersion() throws Exception {
|
||||
try {
|
||||
CustomAnalyzer.builder()
|
||||
.withDefaultMatchVersion(null)
|
||||
.withTokenizer("whitespace")
|
||||
.build();
|
||||
fail();
|
||||
} catch (NullPointerException e) {
|
||||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
foo
|
||||
bar
|
|
@ -164,7 +164,11 @@ and proximity searches (though sentence identification is not provided by Lucene
|
|||
supplies a large family of <code>Analyzer</code> classes that deliver useful
|
||||
analysis chains. The most common of these is the <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
|
||||
Many applications will have a long and industrious life with nothing more
|
||||
than the <code>StandardAnalyzer</code>.
|
||||
than the <code>StandardAnalyzer</code>. The <a href="{@docRoot}/../analyzers-common/overview-summary.html">analyzers-common</a>
|
||||
library provides many pre-existing analyzers for various languages.
|
||||
The analysis-common library also allows to configure a custom Analyzer without subclassing using the
|
||||
<a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/custom/CustomAnalyzer.html">CustomAnalyzer</a>
|
||||
class.
|
||||
</p>
|
||||
<p>
|
||||
Aside from the <code>StandardAnalyzer</code>,
|
||||
|
@ -258,8 +262,7 @@ and proximity searches (though sentence identification is not provided by Lucene
|
|||
create, or a combination of existing and newly created components. Before
|
||||
pursuing this approach, you may find it worthwhile to explore the
|
||||
<a href="{@docRoot}/../analyzers-common/overview-summary.html">analyzers-common</a> library and/or ask on the
|
||||
<a href="http://lucene.apache.org/core/discussion.html"
|
||||
>java-user@lucene.apache.org mailing list</a> first to see if what you
|
||||
<a href="http://lucene.apache.org/core/discussion.html">java-user@lucene.apache.org mailing list</a> first to see if what you
|
||||
need already exists. If you are still committed to creating your own
|
||||
Analyzer, have a look at the source code of any one of the many samples
|
||||
located in this package.
|
||||
|
|
Loading…
Reference in New Issue