LUCENE-6177: Add CustomAnalyzer that allows you to configure analyzers the way you do in Solr's index schema. This class has a builder API to configure Tokenizers, TokenFilters, and CharFilters based on their SPI names and parameters, as documented by the corresponding factories.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1651681 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2015-01-14 14:32:31 +00:00
parent 71f698c543
commit 7dff28a7bb
7 changed files with 665 additions and 4 deletions


@@ -143,6 +143,12 @@ New Features
* LUCENE-6166: Deletions (alone) can now trigger new merges. (Mike McCandless)

* LUCENE-6177: Add CustomAnalyzer that allows you to configure analyzers
  the way you do in Solr's index schema. This class has a builder API to configure
  Tokenizers, TokenFilters, and CharFilters based on their SPI names
  and parameters as documented by the corresponding factories.
  (Uwe Schindler)

Optimizations

* LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to

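For illustration, a minimal sketch of the new builder API; the SPI names used here ("standard", "asciifolding", "lowercase") ship with analyzers-common, and the helper method name is illustrative:

  // A minimal usage sketch; assumes lucene-analyzers-common is on the classpath.
  static Analyzer buildExample() throws IOException {
    return CustomAnalyzer.builder()
        .withTokenizer("standard")
        .addTokenFilter("asciifolding", "preserveOriginal", "true")
        .addTokenFilter("lowercase")
        .build();
  }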

@@ -0,0 +1,323 @@
package org.apache.lucene.analysis.custom;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.Version;
/**
 * A general-purpose Analyzer that can be created with a builder-style API.
 * Under the hood it uses the factory classes {@link TokenizerFactory},
 * {@link TokenFilterFactory}, and {@link CharFilterFactory}.
 * <p>You can create an instance of this Analyzer using the builder:
 * <pre class="prettyprint">
 * Analyzer ana = CustomAnalyzer.builder(Paths.get(&quot;/path/to/config/dir&quot;))
 *   .withTokenizer(&quot;standard&quot;)
 *   .addTokenFilter(&quot;standard&quot;)
 *   .addTokenFilter(&quot;lowercase&quot;)
 *   .addTokenFilter(&quot;stop&quot;, &quot;ignoreCase&quot;, &quot;false&quot;, &quot;words&quot;, &quot;stopwords.txt&quot;, &quot;format&quot;, &quot;wordset&quot;)
 *   .build();
 * </pre>
 * The parameters passed to components are also used by Apache Solr and are documented
 * on their corresponding factory classes. Refer to the documentation of the subclasses
 * of {@link TokenizerFactory}, {@link TokenFilterFactory}, and {@link CharFilterFactory}.
 * <p>The list of names to be used for components can be looked up through:
 * {@link TokenizerFactory#availableTokenizers()}, {@link TokenFilterFactory#availableTokenFilters()},
 * and {@link CharFilterFactory#availableCharFilters()}.
 */
public final class CustomAnalyzer extends Analyzer {

  /** Returns a builder for custom analyzers that loads all resources from the classpath.
   * All path names given must be absolute with package prefixes. */
  public static Builder builder() {
    return builder(new ClasspathResourceLoader());
  }

  /** Returns a builder for custom analyzers that loads all resources from the given
   * file system base directory. Place, e.g., stop word files there.
   * Files that are not in the given directory are loaded from the classpath. */
  public static Builder builder(Path configDir) {
    return builder(new FilesystemResourceLoader(configDir));
  }

  /** Returns a builder for custom analyzers that loads all resources using the given {@link ResourceLoader}. */
  public static Builder builder(ResourceLoader loader) {
    return new Builder(loader);
  }
  private final CharFilterFactory[] charFilters;
  private final TokenizerFactory tokenizer;
  private final TokenFilterFactory[] tokenFilters;
  private final Integer posIncGap, offsetGap;

  CustomAnalyzer(Version defaultMatchVersion, CharFilterFactory[] charFilters, TokenizerFactory tokenizer,
      TokenFilterFactory[] tokenFilters, Integer posIncGap, Integer offsetGap) {
    this.charFilters = charFilters;
    this.tokenizer = tokenizer;
    this.tokenFilters = tokenFilters;
    this.posIncGap = posIncGap;
    this.offsetGap = offsetGap;
    if (defaultMatchVersion != null) {
      setVersion(defaultMatchVersion);
    }
  }
  @Override
  protected Reader initReader(String fieldName, Reader reader) {
    // wrap the raw reader with each char filter, in the order they were added
    for (final CharFilterFactory charFilter : charFilters) {
      reader = charFilter.create(reader);
    }
    return reader;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    // create the tokenizer and stack all token filters on top of it
    final Tokenizer tk = tokenizer.create();
    TokenStream ts = tk;
    for (final TokenFilterFactory filter : tokenFilters) {
      ts = filter.create(ts);
    }
    return new TokenStreamComponents(tk, ts);
  }
  @Override
  public int getPositionIncrementGap(String fieldName) {
    // use default from Analyzer base class if null
    return (posIncGap == null) ? super.getPositionIncrementGap(fieldName) : posIncGap.intValue();
  }

  @Override
  public int getOffsetGap(String fieldName) {
    // use default from Analyzer base class if null
    return (offsetGap == null) ? super.getOffsetGap(fieldName) : offsetGap.intValue();
  }

  /** Returns the list of char filters that are used in this analyzer. */
  public List<CharFilterFactory> getCharFilterFactories() {
    return Collections.unmodifiableList(Arrays.asList(charFilters));
  }

  /** Returns the tokenizer that is used in this analyzer. */
  public TokenizerFactory getTokenizerFactory() {
    return tokenizer;
  }

  /** Returns the list of token filters that are used in this analyzer. */
  public List<TokenFilterFactory> getTokenFilterFactories() {
    return Collections.unmodifiableList(Arrays.asList(tokenFilters));
  }

  @Override
  public String toString() {
    final StringBuilder sb = new StringBuilder(getClass().getSimpleName()).append('(');
    for (final CharFilterFactory filter : charFilters) {
      sb.append(filter).append(',');
    }
    sb.append(tokenizer);
    for (final TokenFilterFactory filter : tokenFilters) {
      sb.append(',').append(filter);
    }
    return sb.append(')').toString();
  }
  /** Builder for {@link CustomAnalyzer}.
   * @see CustomAnalyzer#builder()
   * @see CustomAnalyzer#builder(Path)
   * @see CustomAnalyzer#builder(ResourceLoader)
   */
  public static final class Builder {
    private final ResourceLoader loader;
    private final SetOnce<Version> defaultMatchVersion = new SetOnce<>();
    private final List<CharFilterFactory> charFilters = new ArrayList<>();
    private final SetOnce<TokenizerFactory> tokenizer = new SetOnce<>();
    private final List<TokenFilterFactory> tokenFilters = new ArrayList<>();
    private final SetOnce<Integer> posIncGap = new SetOnce<>();
    private final SetOnce<Integer> offsetGap = new SetOnce<>();

    private boolean componentsAdded = false;

    Builder(ResourceLoader loader) {
      this.loader = loader;
    }
    /** This match version is passed as default to all tokenizers or filters. It is used unless you
     * pass the parameter {@code luceneMatchVersion} explicitly. It defaults to undefined, so the
     * underlying factory will (in most cases) use {@link Version#LATEST}. */
    public Builder withDefaultMatchVersion(Version version) {
      Objects.requireNonNull(version, "version may not be null");
      if (componentsAdded) {
        throw new IllegalStateException("You may only set the default match version before adding tokenizers, " +
            "token filters, or char filters.");
      }
      this.defaultMatchVersion.set(version);
      return this;
    }
    /** Sets the position increment gap of the analyzer.
     * The default is defined in the analyzer base class.
     * @see Analyzer#getPositionIncrementGap(String)
     */
    public Builder withPositionIncrementGap(int posIncGap) {
      if (posIncGap < 0) {
        throw new IllegalArgumentException("posIncGap must be >= 0");
      }
      this.posIncGap.set(posIncGap);
      return this;
    }

    /** Sets the offset gap of the analyzer. The default is defined
     * in the analyzer base class.
     * @see Analyzer#getOffsetGap(String)
     */
    public Builder withOffsetGap(int offsetGap) {
      if (offsetGap < 0) {
        throw new IllegalArgumentException("offsetGap must be >= 0");
      }
      this.offsetGap.set(offsetGap);
      return this;
    }

    /** Uses the given tokenizer.
     * @param name is used to look up the factory with {@link TokenizerFactory#forName(String, Map)}.
     *   The list of possible names can be looked up with {@link TokenizerFactory#availableTokenizers()}.
     * @param params a list of factory string params as key/value pairs.
     *   The number of parameters must be an even number, as they are pairs.
     */
    public Builder withTokenizer(String name, String... params) throws IOException {
      return withTokenizer(name, paramsToMap(params));
    }

    /** Uses the given tokenizer.
     * @param name is used to look up the factory with {@link TokenizerFactory#forName(String, Map)}.
     *   The list of possible names can be looked up with {@link TokenizerFactory#availableTokenizers()}.
     * @param params the map of parameters to be passed to the factory. The map must be modifiable.
     */
    public Builder withTokenizer(String name, Map<String,String> params) throws IOException {
      Objects.requireNonNull(name, "Tokenizer name may not be null");
      tokenizer.set(applyResourceLoader(TokenizerFactory.forName(name, applyDefaultParams(params))));
      componentsAdded = true;
      return this;
    }

    /** Adds the given token filter.
     * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}.
     *   The list of possible names can be looked up with {@link TokenFilterFactory#availableTokenFilters()}.
     * @param params a list of factory string params as key/value pairs.
     *   The number of parameters must be an even number, as they are pairs.
     */
    public Builder addTokenFilter(String name, String... params) throws IOException {
      return addTokenFilter(name, paramsToMap(params));
    }

    /** Adds the given token filter.
     * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}.
     *   The list of possible names can be looked up with {@link TokenFilterFactory#availableTokenFilters()}.
     * @param params the map of parameters to be passed to the factory. The map must be modifiable.
     */
    public Builder addTokenFilter(String name, Map<String,String> params) throws IOException {
      Objects.requireNonNull(name, "TokenFilter name may not be null");
      tokenFilters.add(applyResourceLoader(TokenFilterFactory.forName(name, applyDefaultParams(params))));
      componentsAdded = true;
      return this;
    }

    /** Adds the given char filter.
     * @param name is used to look up the factory with {@link CharFilterFactory#forName(String, Map)}.
     *   The list of possible names can be looked up with {@link CharFilterFactory#availableCharFilters()}.
     * @param params a list of factory string params as key/value pairs.
     *   The number of parameters must be an even number, as they are pairs.
     */
    public Builder addCharFilter(String name, String... params) throws IOException {
      return addCharFilter(name, paramsToMap(params));
    }

    /** Adds the given char filter.
     * @param name is used to look up the factory with {@link CharFilterFactory#forName(String, Map)}.
     *   The list of possible names can be looked up with {@link CharFilterFactory#availableCharFilters()}.
     * @param params the map of parameters to be passed to the factory. The map must be modifiable.
     */
    public Builder addCharFilter(String name, Map<String,String> params) throws IOException {
      Objects.requireNonNull(name, "CharFilter name may not be null");
      charFilters.add(applyResourceLoader(CharFilterFactory.forName(name, applyDefaultParams(params))));
      componentsAdded = true;
      return this;
    }
    /** Builds the analyzer. */
    public CustomAnalyzer build() {
      if (tokenizer.get() == null) {
        throw new IllegalStateException("You have to set at least a tokenizer.");
      }
      return new CustomAnalyzer(
        defaultMatchVersion.get(),
        charFilters.toArray(new CharFilterFactory[charFilters.size()]),
        tokenizer.get(),
        tokenFilters.toArray(new TokenFilterFactory[tokenFilters.size()]),
        posIncGap.get(),
        offsetGap.get()
      );
    }

    private Map<String,String> applyDefaultParams(Map<String,String> map) {
      // inject the default match version unless the caller passed luceneMatchVersion explicitly
      if (defaultMatchVersion.get() != null && !map.containsKey(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM)) {
        map.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, defaultMatchVersion.get().toString());
      }
      return map;
    }

    private Map<String,String> paramsToMap(String... params) {
      if (params.length % 2 != 0) {
        throw new IllegalArgumentException("Key-value pairs expected, so the number of params must be even.");
      }
      final Map<String,String> map = new HashMap<>();
      for (int i = 0; i < params.length; i += 2) {
        Objects.requireNonNull(params[i], "Key of param may not be null.");
        map.put(params[i], params[i + 1]);
      }
      return map;
    }

    private <T> T applyResourceLoader(T factory) throws IOException {
      // give ResourceLoaderAware factories (e.g., the stop filter) a chance to load their resources
      if (factory instanceof ResourceLoaderAware) {
        ((ResourceLoaderAware) factory).inform(loader);
      }
      return factory;
    }
  }
}

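One behavior of the Map-based overloads above is worth a sketch: the factories consume the entries they recognize, so the map must be modifiable (the tests below exercise exactly this). The config directory path and stop word file name here are illustrative placeholders:

  // Sketch: map-based configuration with a file system resource loader.
  // Imports assumed: java.io.IOException, java.nio.file.Paths, java.util.HashMap,
  // java.util.Map, org.apache.lucene.analysis.Analyzer
  static Analyzer buildWithStopWords() throws IOException {
    Map<String,String> stopParams = new HashMap<>(); // must be modifiable: the stop
    stopParams.put("ignoreCase", "true");            // filter factory removes the
    stopParams.put("words", "stopwords.txt");        // entries it recognizes
    stopParams.put("format", "wordset");
    return CustomAnalyzer.builder(Paths.get("/path/to/config/dir"))
        .withTokenizer("whitespace")
        .addTokenFilter("stop", stopParams)
        .build();
  }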

@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
A general-purpose Analyzer that can be created with a builder-style API.
</body>
</html>


@@ -27,7 +27,8 @@
This module contains concrete components ({@link org.apache.lucene.analysis.CharFilter}s,
{@link org.apache.lucene.analysis.Tokenizer}s, and {@link org.apache.lucene.analysis.TokenFilter}s) for
analyzing different types of content. It also provides a number of {@link org.apache.lucene.analysis.Analyzer}s
for different languages that you can use to get started quickly. To define fully custom Analyzers
(like in the index schema of Apache Solr), this module provides {@link org.apache.lucene.analysis.custom.CustomAnalyzer}.
</p>
</body>
</html>


@@ -0,0 +1,289 @@
package org.apache.lucene.analysis.custom;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.SetOnce.AlreadySetException;
import org.apache.lucene.util.Version;
public class TestCustomAnalyzer extends BaseTokenStreamTestCase {

  // Test some examples (TODO: we only check behavior, we may need something like TestRandomChains...)

  public void testWhitespaceWithFolding() throws Exception {
    CustomAnalyzer a = CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("asciifolding", "preserveOriginal", "true")
        .addTokenFilter("lowercase")
        .build();

    assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
    assertEquals(Collections.emptyList(), a.getCharFilterFactories());
    List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
    assertEquals(2, tokenFilters.size());
    assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
    assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
    assertEquals(0, a.getPositionIncrementGap("dummy"));
    assertEquals(1, a.getOffsetGap("dummy"));
    assertSame(Version.LATEST, a.getVersion());

    assertAnalyzesTo(a, "foo bar FOO BAR",
        new String[] { "foo", "bar", "foo", "bar" },
        new int[] { 1, 1, 1, 1 });
    assertAnalyzesTo(a, "föó bär FÖÖ BAR",
        new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });
  }
  public void testHtmlStripClassicFolding() throws Exception {
    CustomAnalyzer a = CustomAnalyzer.builder()
        .withDefaultMatchVersion(Version.LUCENE_5_0_0)
        .addCharFilter("htmlstrip")
        .withTokenizer("classic")
        .addTokenFilter("asciifolding", "preserveOriginal", "true")
        .addTokenFilter("lowercase")
        .withPositionIncrementGap(100)
        .withOffsetGap(1000)
        .build();

    assertSame(ClassicTokenizerFactory.class, a.getTokenizerFactory().getClass());
    List<CharFilterFactory> charFilters = a.getCharFilterFactories();
    assertEquals(1, charFilters.size());
    assertEquals(HTMLStripCharFilterFactory.class, charFilters.get(0).getClass());
    List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
    assertEquals(2, tokenFilters.size());
    assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
    assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
    assertEquals(100, a.getPositionIncrementGap("dummy"));
    assertEquals(1000, a.getOffsetGap("dummy"));
    assertSame(Version.LUCENE_5_0_0, a.getVersion());

    assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
        new String[] { "foo", "bar", "foo", "bar" },
        new int[] { 1, 1, 1, 1 });
    assertAnalyzesTo(a, "<p><b>föó</b> bär FÖÖ BAR</p>",
        new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });
  }
  public void testStopWordsFromClasspath() throws Exception {
    CustomAnalyzer a = CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("stop",
            "ignoreCase", "true",
            "words", "org/apache/lucene/analysis/custom/teststop.txt",
            "format", "wordset")
        .build();

    assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
    assertEquals(Collections.emptyList(), a.getCharFilterFactories());
    List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
    assertEquals(1, tokenFilters.size());
    assertSame(StopFilterFactory.class, tokenFilters.get(0).getClass());
    assertEquals(0, a.getPositionIncrementGap("dummy"));
    assertEquals(1, a.getOffsetGap("dummy"));
    assertSame(Version.LATEST, a.getVersion());

    assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
  }

  public void testStopWordsFromClasspathWithMap() throws Exception {
    Map<String,String> stopConfig1 = new HashMap<>();
    stopConfig1.put("ignoreCase", "true");
    stopConfig1.put("words", "org/apache/lucene/analysis/custom/teststop.txt");
    stopConfig1.put("format", "wordset");

    Map<String,String> stopConfig2 = Collections.unmodifiableMap(new HashMap<>(stopConfig1));

    CustomAnalyzer a = CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("stop", stopConfig1)
        .build();
    assertTrue(stopConfig1.isEmpty());
    assertAnalyzesTo(a, "foo Foo Bar", new String[0]);

    // try with unmodifiableMap, should fail
    try {
      CustomAnalyzer.builder()
          .withTokenizer("whitespace")
          .addTokenFilter("stop", stopConfig2)
          .build();
      fail();
    } catch (IllegalArgumentException | UnsupportedOperationException e) {
      // pass
    }
  }

  public void testStopWordsFromFile() throws Exception {
    CustomAnalyzer a = CustomAnalyzer.builder(this.getDataPath(""))
        .withTokenizer("whitespace")
        .addTokenFilter("stop",
            "ignoreCase", "true",
            "words", "teststop.txt",
            "format", "wordset")
        .build();
    assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
  }

  public void testStopWordsFromFileAbsolute() throws Exception {
    CustomAnalyzer a = CustomAnalyzer.builder(Paths.get("."))
        .withTokenizer("whitespace")
        .addTokenFilter("stop",
            "ignoreCase", "true",
            "words", this.getDataPath("teststop.txt").toString(),
            "format", "wordset")
        .build();
    assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
  }
  // Now test misconfigurations:

  public void testIncorrectOrder() throws Exception {
    try {
      CustomAnalyzer.builder()
          .addCharFilter("htmlstrip")
          .withDefaultMatchVersion(Version.LATEST)
          .withTokenizer("whitespace")
          .build();
      fail();
    } catch (IllegalStateException e) {
      // pass
    }
  }

  public void testMissingSPI() throws Exception {
    try {
      CustomAnalyzer.builder()
          .withTokenizer("foobar_nonexistent")
          .build();
      fail();
    } catch (IllegalArgumentException e) {
      assertTrue(e.getMessage().contains("SPI"));
      assertTrue(e.getMessage().contains("does not exist"));
    }
  }

  public void testSetTokenizerTwice() throws Exception {
    try {
      CustomAnalyzer.builder()
          .withTokenizer("whitespace")
          .withTokenizer("standard")
          .build();
      fail();
    } catch (AlreadySetException e) {
      // pass
    }
  }

  public void testSetMatchVersionTwice() throws Exception {
    try {
      CustomAnalyzer.builder()
          .withDefaultMatchVersion(Version.LATEST)
          .withDefaultMatchVersion(Version.LATEST)
          .withTokenizer("standard")
          .build();
      fail();
    } catch (AlreadySetException e) {
      // pass
    }
  }

  public void testSetPosIncTwice() throws Exception {
    try {
      CustomAnalyzer.builder()
          .withPositionIncrementGap(2)
          .withPositionIncrementGap(3)
          .withTokenizer("standard")
          .build();
      fail();
    } catch (AlreadySetException e) {
      // pass
    }
  }

  public void testSetOfsGapTwice() throws Exception {
    try {
      CustomAnalyzer.builder()
          .withOffsetGap(2)
          .withOffsetGap(3)
          .withTokenizer("standard")
          .build();
      fail();
    } catch (AlreadySetException e) {
      // pass
    }
  }

  public void testNoTokenizer() throws Exception {
    try {
      CustomAnalyzer.builder().build();
      fail();
    } catch (IllegalStateException e) {
      assertEquals("You have to set at least a tokenizer.", e.getMessage());
    }
  }

  public void testNullTokenizer() throws Exception {
    try {
      CustomAnalyzer.builder()
          .withTokenizer(null)
          .build();
      fail();
    } catch (NullPointerException e) {
      // pass
    }
  }

  public void testNullParamKey() throws Exception {
    try {
      CustomAnalyzer.builder()
          .withTokenizer("whitespace", null, "foo")
          .build();
      fail();
    } catch (NullPointerException e) {
      // pass
    }
  }

  public void testNullMatchVersion() throws Exception {
    try {
      CustomAnalyzer.builder()
          .withDefaultMatchVersion(null)
          .withTokenizer("whitespace")
          .build();
      fail();
    } catch (NullPointerException e) {
      // pass
    }
  }

}

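The SPI names exercised by these tests ("whitespace", "classic", "stop", and so on) can be discovered at runtime through the lookup methods the CustomAnalyzer javadoc points to; a short sketch:

  // Sketch: list the component names that CustomAnalyzer.Builder accepts.
  System.out.println("tokenizers:    " + TokenizerFactory.availableTokenizers());
  System.out.println("token filters: " + TokenFilterFactory.availableTokenFilters());
  System.out.println("char filters:  " + CharFilterFactory.availableCharFilters());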

@@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
foo
bar


@@ -164,7 +164,11 @@ and proximity searches (though sentence identification is not provided by Lucene
supplies a large family of <code>Analyzer</code> classes that deliver useful
analysis chains. The most common of these is the <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
Many applications will have a long and industrious life with nothing more
than the <code>StandardAnalyzer</code>. The <a href="{@docRoot}/../analyzers-common/overview-summary.html">analyzers-common</a>
library provides many pre-existing analyzers for various languages.
The analyzers-common library also lets you configure a custom Analyzer without subclassing, using the
<a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/custom/CustomAnalyzer.html">CustomAnalyzer</a>
class.
</p>
<p>
Aside from the <code>StandardAnalyzer</code>,
@@ -258,8 +262,7 @@ and proximity searches (though sentence identification is not provided by Lucene
create, or a combination of existing and newly created components. Before
pursuing this approach, you may find it worthwhile to explore the
<a href="{@docRoot}/../analyzers-common/overview-summary.html">analyzers-common</a> library and/or ask on the
<a href="http://lucene.apache.org/core/discussion.html"
>java-user@lucene.apache.org mailing list</a> first to see if what you
<a href="http://lucene.apache.org/core/discussion.html">java-user@lucene.apache.org mailing list</a> first to see if what you
need already exists. If you are still committed to creating your own
Analyzer, have a look at the source code of any one of the many samples
located in this package.