mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-13 00:15:47 +00:00
Add `max_token_length` setting to the CharGroupTokenizer (#56860)
Adds `max_token_length` option to the CharGroupTokenizer. Updates documentation as well to reflect the changes. Closes #56676
This commit is contained in:
parent
ee4ce8ecec
commit
19a336e8d3
@ -18,6 +18,10 @@ The `char_group` tokenizer accepts one parameter:
|
|||||||
characters like e.g. `-`, or character groups: `whitespace`, `letter`, `digit`,
|
characters like e.g. `-`, or character groups: `whitespace`, `letter`, `digit`,
|
||||||
`punctuation`, `symbol`.
|
`punctuation`, `symbol`.
|
||||||
|
|
||||||
|
`max_token_length`::
|
||||||
|
The maximum token length. If a token is seen that exceeds this length then
|
||||||
|
it is split at `max_token_length` intervals. Defaults to `255`.
|
||||||
|
|
||||||
|
|
||||||
[float]
|
[float]
|
||||||
=== Example output
|
=== Example output
|
||||||
|
@ -21,6 +21,7 @@ package org.elasticsearch.analysis.common;
|
|||||||
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.util.CharTokenizer;
|
import org.apache.lucene.analysis.util.CharTokenizer;
|
||||||
|
import org.apache.lucene.util.AttributeFactory;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
@ -31,7 +32,10 @@ import java.util.Set;
|
|||||||
|
|
||||||
public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
|
public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
|
||||||
|
|
||||||
|
static final String MAX_TOKEN_LENGTH = "max_token_length";
|
||||||
|
|
||||||
private final Set<Integer> tokenizeOnChars = new HashSet<>();
|
private final Set<Integer> tokenizeOnChars = new HashSet<>();
|
||||||
|
private final Integer maxTokenLength;
|
||||||
private boolean tokenizeOnSpace = false;
|
private boolean tokenizeOnSpace = false;
|
||||||
private boolean tokenizeOnLetter = false;
|
private boolean tokenizeOnLetter = false;
|
||||||
private boolean tokenizeOnDigit = false;
|
private boolean tokenizeOnDigit = false;
|
||||||
@ -41,6 +45,8 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
|
|||||||
public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||||
super(indexSettings, settings, name);
|
super(indexSettings, settings, name);
|
||||||
|
|
||||||
|
maxTokenLength = settings.getAsInt(MAX_TOKEN_LENGTH, CharTokenizer.DEFAULT_MAX_WORD_LEN);
|
||||||
|
|
||||||
for (final String c : settings.getAsList("tokenize_on_chars")) {
|
for (final String c : settings.getAsList("tokenize_on_chars")) {
|
||||||
if (c == null || c.length() == 0) {
|
if (c == null || c.length() == 0) {
|
||||||
throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
|
throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
|
||||||
@ -110,7 +116,7 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Tokenizer create() {
|
public Tokenizer create() {
|
||||||
return new CharTokenizer() {
|
return new CharTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, maxTokenLength) {
|
||||||
@Override
|
@Override
|
||||||
protected boolean isTokenChar(int c) {
|
protected boolean isTokenChar(int c) {
|
||||||
if (tokenizeOnSpace && Character.isWhitespace(c)) {
|
if (tokenizeOnSpace && Character.isWhitespace(c)) {
|
||||||
|
@ -19,7 +19,9 @@
|
|||||||
|
|
||||||
package org.elasticsearch.analysis.common;
|
package org.elasticsearch.analysis.common;
|
||||||
|
|
||||||
|
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharTokenizer;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.index.Index;
|
import org.elasticsearch.index.Index;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
@ -27,11 +29,12 @@ import org.elasticsearch.test.ESTokenStreamTestCase;
|
|||||||
import org.elasticsearch.test.IndexSettingsModule;
|
import org.elasticsearch.test.IndexSettingsModule;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
|
||||||
public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
|
public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
|
||||||
|
|
||||||
public void testParseTokenChars() {
|
public void testParseTokenChars() {
|
||||||
final Index index = new Index("test", "_na_");
|
final Index index = new Index("test", "_na_");
|
||||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||||
@ -61,6 +64,53 @@ public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testMaxTokenLength() throws IOException {
|
||||||
|
final Index index = new Index("test", "_na_");
|
||||||
|
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||||
|
IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
|
||||||
|
final String name = "cg";
|
||||||
|
|
||||||
|
String[] conf = new String[] {"-"};
|
||||||
|
|
||||||
|
final Settings defaultLengthSettings = newAnalysisSettingsBuilder()
|
||||||
|
.putList("tokenize_on_chars", conf)
|
||||||
|
.build();
|
||||||
|
CharTokenizer tokenizer = (CharTokenizer) new CharGroupTokenizerFactory(indexProperties, null, name, defaultLengthSettings)
|
||||||
|
.create();
|
||||||
|
String textWithVeryLongToken = RandomStrings.randomAsciiAlphanumOfLength(random(), 256).concat("-trailing");
|
||||||
|
try (Reader reader = new StringReader(textWithVeryLongToken)) {
|
||||||
|
tokenizer.setReader(reader);
|
||||||
|
assertTokenStreamContents(tokenizer, new String[] { textWithVeryLongToken.substring(0, 255),
|
||||||
|
textWithVeryLongToken.substring(255, 256), "trailing"});
|
||||||
|
}
|
||||||
|
|
||||||
|
final Settings analysisSettings = newAnalysisSettingsBuilder()
|
||||||
|
.putList("tokenize_on_chars", conf)
|
||||||
|
.put("max_token_length", 2)
|
||||||
|
.build();
|
||||||
|
tokenizer = (CharTokenizer) new CharGroupTokenizerFactory(indexProperties, null, name, analysisSettings).create();
|
||||||
|
try (Reader reader = new StringReader("one-two-three")) {
|
||||||
|
tokenizer.setReader(reader);
|
||||||
|
assertTokenStreamContents(tokenizer, new String[] { "on", "e", "tw", "o", "th", "re", "e" });
|
||||||
|
}
|
||||||
|
|
||||||
|
final Settings tooLongLengthSettings = newAnalysisSettingsBuilder()
|
||||||
|
.putList("tokenize_on_chars", conf)
|
||||||
|
.put("max_token_length", 1024 * 1024 + 1)
|
||||||
|
.build();
|
||||||
|
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||||
|
() -> new CharGroupTokenizerFactory(indexProperties, null, name, tooLongLengthSettings).create());
|
||||||
|
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 1048577", e.getMessage());
|
||||||
|
|
||||||
|
final Settings negativeLengthSettings = newAnalysisSettingsBuilder()
|
||||||
|
.putList("tokenize_on_chars", conf)
|
||||||
|
.put("max_token_length", -1)
|
||||||
|
.build();
|
||||||
|
e = expectThrows(IllegalArgumentException.class,
|
||||||
|
() -> new CharGroupTokenizerFactory(indexProperties, null, name, negativeLengthSettings).create());
|
||||||
|
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
public void testTokenization() throws IOException {
|
public void testTokenization() throws IOException {
|
||||||
final Index index = new Index("test", "_na_");
|
final Index index = new Index("test", "_na_");
|
||||||
final String name = "cg";
|
final String name = "cg";
|
||||||
|
Loading…
x
Reference in New Issue
Block a user