Run spotless and exclude checkstyle on modules module (#1442)
Signed-off-by: Owais Kazi <owaiskazi19@gmail.com>
parent 3f6e1df9eb
commit 37ac3788a3
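The changes below are the mechanical output of running the Spotless formatter over the modules sub-projects, plus a Checkstyle suppression so the newly formatted tree is no longer checked by Checkstyle. As a rough sketch of the tooling involved (the plugin version, target pattern, and step choices here are assumptions for illustration, not the project's actual formatting.gradle):

    plugins {
        id 'java'
        id 'com.diffplug.spotless' version '5.17.1' // hypothetical version, for illustration only
    }

    spotless {
        java {
            target 'src/**/*.java'    // format every Java source in the module
            removeUnusedImports()     // standard Spotless Java steps
            trimTrailingWhitespace()
            endWithNewline()
        }
    }

With a setup like this, ./gradlew spotlessCheck fails the build on unformatted sources and ./gradlew spotlessApply rewrites them, which is what produced the bulk of this diff.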
@@ -25,6 +25,8 @@
   <suppress files="plugins[/\\]examples" checks="." />
   <!-- Excludes checkstyle run on libs module -->
   <suppress files="libs" checks="." />
+  <!-- Excludes checkstyle run on modules module -->
+  <suppress files="modules" checks="." />
 
   <!--
     Truly temporary suppressions suppression of snippets included in
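In a Checkstyle suppressions file, the files attribute is a regular expression matched against each source file's path and checks="." matches every check name, so the two added entries switch Checkstyle off for everything under modules, mirroring the existing libs entry. A rough sketch of how such a file is commonly handed to the Gradle Checkstyle plugin (the file locations and property name are assumptions, not this repository's exact wiring):

    apply plugin: 'checkstyle'

    checkstyle {
        // checkstyle.xml is assumed to contain a SuppressionFilter module that
        // reads the "suppressions" property set here.
        configFile = rootProject.file('buildSrc/src/main/resources/checkstyle.xml')
        configProperties = [suppressions: rootProject.file('buildSrc/src/main/resources/checkstyle_suppressions.xml')]
    }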
|
@@ -57,24 +57,6 @@ import org.opensearch.gradle.BuildPlugin
 
 // Do not add new sub-projects here!
 def projectPathsToExclude = [
-  ':modules:aggs-matrix-stats',
-  ':modules:analysis-common',
-  ':modules:ingest-common',
-  ':modules:ingest-geoip',
-  ':modules:ingest-user-agent',
-  ':modules:lang-expression',
-  ':modules:lang-mustache',
-  ':modules:lang-painless',
-  ':modules:lang-painless:spi',
-  ':modules:mapper-extras',
-  ':modules:parent-join',
-  ':modules:percolator',
-  ':modules:rank-eval',
-  ':modules:reindex',
-  ':modules:repository-url',
-  ':modules:systemd',
-  ':modules:tasks',
-  ':modules:transport-netty4',
   ':plugins:analysis-icu',
   ':plugins:analysis-kuromoji',
   ':plugins:analysis-nori',
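projectPathsToExclude is the list of sub-projects that have not yet been reformatted, so deleting every :modules:* entry is what opts those projects into Spotless. A minimal sketch of how such an exclusion list is typically consumed (the plugin wiring and formatter configuration below are assumptions for illustration, not the project's verbatim build script):

    subprojects {
        plugins.withType(BuildPlugin).whenPluginAdded {
            if (projectPathsToExclude.contains(project.path) == false) {
                project.apply plugin: 'com.diffplug.spotless'
                spotless {
                    java {
                        // an Eclipse-style formatter config is one common choice; the path is assumed
                        eclipse().configFile rootProject.file('buildSrc/formatterConfig.xml')
                        trimTrailingWhitespace()
                        endWithNewline()
                    }
                }
            }
        }
    }

Projects still on the list keep their old formatting and are skipped by spotlessCheck until they are cleaned up and removed, which is exactly what this commit does for the modules tree.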
@@ -45,7 +45,9 @@ import static java.util.Collections.singletonList;
 public class MatrixAggregationPlugin extends Plugin implements SearchPlugin {
     @Override
     public List<AggregationSpec> getAggregations() {
-        return singletonList(new AggregationSpec(MatrixStatsAggregationBuilder.NAME, MatrixStatsAggregationBuilder::new,
-            new MatrixStatsParser()).addResultReader(InternalMatrixStats::new));
+        return singletonList(
+            new AggregationSpec(MatrixStatsAggregationBuilder.NAME, MatrixStatsAggregationBuilder::new, new MatrixStatsParser())
+                .addResultReader(InternalMatrixStats::new)
+        );
     }
 }
@@ -54,8 +54,13 @@ public class InternalMatrixStats extends InternalAggregation implements MatrixSt
     private final MatrixStatsResults results;
 
     /** per shard ctor */
-    InternalMatrixStats(String name, long count, RunningStats multiFieldStatsResults, MatrixStatsResults results,
-                        Map<String, Object> metadata) {
+    InternalMatrixStats(
+        String name,
+        long count,
+        RunningStats multiFieldStatsResults,
+        MatrixStatsResults results,
+        Map<String, Object> metadata
+    ) {
         super(name, metadata);
         assert count >= 0;
         this.stats = multiFieldStatsResults;
@@ -248,7 +253,7 @@ public class InternalMatrixStats extends InternalAggregation implements MatrixSt
     public InternalAggregation reduce(List<InternalAggregation> aggregations, ReduceContext reduceContext) {
         // merge stats across all shards
         List<InternalAggregation> aggs = new ArrayList<>(aggregations);
-        aggs.removeIf(p -> ((InternalMatrixStats)p).stats == null);
+        aggs.removeIf(p -> ((InternalMatrixStats) p).stats == null);
 
         // return empty result iff all stats are null
         if (aggs.isEmpty()) {
@@ -284,7 +289,6 @@ public class InternalMatrixStats extends InternalAggregation implements MatrixSt
         if (super.equals(obj) == false) return false;
 
         InternalMatrixStats other = (InternalMatrixStats) obj;
-        return Objects.equals(this.stats, other.stats) &&
-            Objects.equals(this.results, other.results);
+        return Objects.equals(this.stats, other.stats) && Objects.equals(this.results, other.results);
     }
 }
@@ -39,18 +39,25 @@ import org.opensearch.search.aggregations.Aggregation;
 public interface MatrixStats extends Aggregation {
     /** return the total document count */
     long getDocCount();
+
     /** return total field count (differs from docCount if there are missing values) */
     long getFieldCount(String field);
+
     /** return the field mean */
     double getMean(String field);
+
     /** return the field variance */
     double getVariance(String field);
+
     /** return the skewness of the distribution */
     double getSkewness(String field);
+
     /** return the kurtosis of the distribution */
     double getKurtosis(String field);
+
     /** return the covariance between field x and field y */
     double getCovariance(String fieldX, String fieldY);
+
     /** return the correlation coefficient of field x and field y */
     double getCorrelation(String fieldX, String fieldY);
 }
@@ -55,8 +55,11 @@ public class MatrixStatsAggregationBuilder extends ArrayValuesSourceAggregationB
         super(name);
     }
 
-    protected MatrixStatsAggregationBuilder(MatrixStatsAggregationBuilder clone,
-                                            AggregatorFactories.Builder factoriesBuilder, Map<String, Object> metadata) {
+    protected MatrixStatsAggregationBuilder(
+        MatrixStatsAggregationBuilder clone,
+        AggregatorFactories.Builder factoriesBuilder,
+        Map<String, Object> metadata
+    ) {
         super(clone, factoriesBuilder, metadata);
         this.multiValueMode = clone.multiValueMode;
     }
@@ -88,10 +91,12 @@ public class MatrixStatsAggregationBuilder extends ArrayValuesSourceAggregationB
     }
 
     @Override
-    protected MatrixStatsAggregatorFactory innerBuild(QueryShardContext queryShardContext,
-                                                      Map<String, ValuesSourceConfig> configs,
-                                                      AggregatorFactory parent,
-                                                      AggregatorFactories.Builder subFactoriesBuilder) throws IOException {
+    protected MatrixStatsAggregatorFactory innerBuild(
+        QueryShardContext queryShardContext,
+        Map<String, ValuesSourceConfig> configs,
+        AggregatorFactory parent,
+        AggregatorFactories.Builder subFactoriesBuilder
+    ) throws IOException {
         return new MatrixStatsAggregatorFactory(name, configs, multiValueMode, queryShardContext, parent, subFactoriesBuilder, metadata);
     }
 
@@ -60,8 +60,14 @@ final class MatrixStatsAggregator extends MetricsAggregator {
     /** array of descriptive stats, per shard, needed to compute the correlation */
     ObjectArray<RunningStats> stats;
 
-    MatrixStatsAggregator(String name, Map<String, ValuesSource.Numeric> valuesSources, SearchContext context,
-                          Aggregator parent, MultiValueMode multiValueMode, Map<String,Object> metadata) throws IOException {
+    MatrixStatsAggregator(
+        String name,
+        Map<String, ValuesSource.Numeric> valuesSources,
+        SearchContext context,
+        Aggregator parent,
+        MultiValueMode multiValueMode,
+        Map<String, Object> metadata
+    ) throws IOException {
         super(name, context, parent, metadata);
         if (valuesSources != null && !valuesSources.isEmpty()) {
             this.valuesSources = new ArrayValuesSource.NumericArrayValuesSource(valuesSources, multiValueMode);
@@ -77,8 +83,7 @@ final class MatrixStatsAggregator extends MetricsAggregator {
     }
 
     @Override
-    public LeafBucketCollector getLeafCollector(LeafReaderContext ctx,
-                                                final LeafBucketCollector sub) throws IOException {
+    public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, final LeafBucketCollector sub) throws IOException {
         if (valuesSources == null) {
             return LeafBucketCollector.NO_OP_COLLECTOR;
         }
@@ -51,36 +51,38 @@ final class MatrixStatsAggregatorFactory extends ArrayValuesSourceAggregatorFact
 
     private final MultiValueMode multiValueMode;
 
-    MatrixStatsAggregatorFactory(String name,
-                                 Map<String, ValuesSourceConfig> configs,
-                                 MultiValueMode multiValueMode,
-                                 QueryShardContext queryShardContext,
-                                 AggregatorFactory parent,
-                                 AggregatorFactories.Builder subFactoriesBuilder,
-                                 Map<String, Object> metadata) throws IOException {
+    MatrixStatsAggregatorFactory(
+        String name,
+        Map<String, ValuesSourceConfig> configs,
+        MultiValueMode multiValueMode,
+        QueryShardContext queryShardContext,
+        AggregatorFactory parent,
+        AggregatorFactories.Builder subFactoriesBuilder,
+        Map<String, Object> metadata
+    ) throws IOException {
         super(name, configs, queryShardContext, parent, subFactoriesBuilder, metadata);
         this.multiValueMode = multiValueMode;
     }
 
     @Override
-    protected Aggregator createUnmapped(SearchContext searchContext,
-                                        Aggregator parent,
-                                        Map<String, Object> metadata)
-        throws IOException {
+    protected Aggregator createUnmapped(SearchContext searchContext, Aggregator parent, Map<String, Object> metadata) throws IOException {
         return new MatrixStatsAggregator(name, null, searchContext, parent, multiValueMode, metadata);
     }
 
     @Override
-    protected Aggregator doCreateInternal(Map<String, ValuesSource> valuesSources,
-                                          SearchContext searchContext,
-                                          Aggregator parent,
-                                          CardinalityUpperBound cardinality,
-                                          Map<String, Object> metadata) throws IOException {
+    protected Aggregator doCreateInternal(
+        Map<String, ValuesSource> valuesSources,
+        SearchContext searchContext,
+        Aggregator parent,
+        CardinalityUpperBound cardinality,
+        Map<String, Object> metadata
+    ) throws IOException {
         Map<String, ValuesSource.Numeric> typedValuesSources = new HashMap<>(valuesSources.size());
         for (Map.Entry<String, ValuesSource> entry : valuesSources.entrySet()) {
             if (entry.getValue() instanceof ValuesSource.Numeric == false) {
-                throw new AggregationExecutionException("ValuesSource type " + entry.getValue().toString() +
-                    "is not supported for aggregation " + this.name());
+                throw new AggregationExecutionException(
+                    "ValuesSource type " + entry.getValue().toString() + "is not supported for aggregation " + this.name()
+                );
             }
             // TODO: There must be a better option than this.
             typedValuesSources.put(entry.getKey(), (ValuesSource.Numeric) entry.getValue());
@@ -49,8 +49,13 @@ public class MatrixStatsParser extends ArrayValuesSourceParser.NumericValuesSour
     }
 
     @Override
-    protected boolean token(String aggregationName, String currentFieldName, XContentParser.Token token, XContentParser parser,
-                            Map<ParseField, Object> otherOptions) throws IOException {
+    protected boolean token(
+        String aggregationName,
+        String currentFieldName,
+        XContentParser.Token token,
+        XContentParser parser,
+        Map<ParseField, Object> otherOptions
+    ) throws IOException {
         if (ArrayValuesSourceAggregationBuilder.MULTIVALUE_MODE_FIELD.match(currentFieldName, parser.getDeprecationHandler())) {
             if (token == XContentParser.Token.VALUE_STRING) {
                 otherOptions.put(ArrayValuesSourceAggregationBuilder.MULTIVALUE_MODE_FIELD, parser.text());
@@ -61,10 +66,14 @@ public class MatrixStatsParser extends ArrayValuesSourceParser.NumericValuesSour
     }
 
     @Override
-    protected MatrixStatsAggregationBuilder createFactory(String aggregationName, ValuesSourceType valuesSourceType,
-                                                          ValueType targetValueType, Map<ParseField, Object> otherOptions) {
+    protected MatrixStatsAggregationBuilder createFactory(
+        String aggregationName,
+        ValuesSourceType valuesSourceType,
+        ValueType targetValueType,
+        Map<ParseField, Object> otherOptions
+    ) {
         MatrixStatsAggregationBuilder builder = new MatrixStatsAggregationBuilder(aggregationName);
-        String mode = (String)otherOptions.get(ArrayValuesSourceAggregationBuilder.MULTIVALUE_MODE_FIELD);
+        String mode = (String) otherOptions.get(ArrayValuesSourceAggregationBuilder.MULTIVALUE_MODE_FIELD);
         if (mode != null) {
             builder.multiValueMode(MultiValueMode.fromString(mode));
         }
@@ -213,7 +213,7 @@ class MatrixStatsResults implements Writeable {
             // update skewness
             results.skewness.put(fieldName, Math.sqrt(results.docCount) * results.skewness.get(fieldName) / Math.pow(var, 1.5D));
             // update kurtosis
-            results.kurtosis.put(fieldName, (double)results.docCount * results.kurtosis.get(fieldName) / (var * var));
+            results.kurtosis.put(fieldName, (double) results.docCount * results.kurtosis.get(fieldName) / (var * var));
             // update variances
             results.variances.put(fieldName, results.variances.get(fieldName) / nM1);
         }
@@ -248,8 +248,7 @@ class MatrixStatsResults implements Writeable {
         if (this == o) return true;
         if (o == null || getClass() != o.getClass()) return false;
         MatrixStatsResults that = (MatrixStatsResults) o;
-        return Objects.equals(results, that.results) &&
-            Objects.equals(correlation, that.correlation);
+        return Objects.equals(results, that.results) && Objects.equals(correlation, that.correlation);
     }
 
     @Override
|
@ -164,8 +164,11 @@ public class ParsedMatrixStats extends ParsedAggregation implements MatrixStats
|
|||
return values.get(fieldName);
|
||||
}
|
||||
|
||||
private static final ObjectParser<ParsedMatrixStats, Void> PARSER =
|
||||
new ObjectParser<>(ParsedMatrixStats.class.getSimpleName(), true, ParsedMatrixStats::new);
|
||||
private static final ObjectParser<ParsedMatrixStats, Void> PARSER = new ObjectParser<>(
|
||||
ParsedMatrixStats.class.getSimpleName(),
|
||||
true,
|
||||
ParsedMatrixStats::new
|
||||
);
|
||||
static {
|
||||
declareAggregationFields(PARSER);
|
||||
PARSER.declareLong(ParsedMatrixStats::setDocCount, CommonFields.DOC_COUNT);
|
||||
|
@ -200,21 +203,27 @@ public class ParsedMatrixStats extends ParsedAggregation implements MatrixStats
|
|||
Map<String, Double> covariances;
|
||||
Map<String, Double> correlations;
|
||||
|
||||
private static final ObjectParser<ParsedMatrixStatsResult, Void> RESULT_PARSER =
|
||||
new ObjectParser<>(ParsedMatrixStatsResult.class.getSimpleName(), true, ParsedMatrixStatsResult::new);
|
||||
private static final ObjectParser<ParsedMatrixStatsResult, Void> RESULT_PARSER = new ObjectParser<>(
|
||||
ParsedMatrixStatsResult.class.getSimpleName(),
|
||||
true,
|
||||
ParsedMatrixStatsResult::new
|
||||
);
|
||||
static {
|
||||
RESULT_PARSER.declareString((result, name) -> result.name = name,
|
||||
new ParseField(InternalMatrixStats.Fields.NAME));
|
||||
RESULT_PARSER.declareLong((result, count) -> result.count = count,
|
||||
new ParseField(InternalMatrixStats.Fields.COUNT));
|
||||
RESULT_PARSER.declareDouble((result, mean) -> result.mean = mean,
|
||||
new ParseField(InternalMatrixStats.Fields.MEAN));
|
||||
RESULT_PARSER.declareDouble((result, variance) -> result.variance = variance,
|
||||
new ParseField(InternalMatrixStats.Fields.VARIANCE));
|
||||
RESULT_PARSER.declareDouble((result, skewness) -> result.skewness = skewness,
|
||||
new ParseField(InternalMatrixStats.Fields.SKEWNESS));
|
||||
RESULT_PARSER.declareDouble((result, kurtosis) -> result.kurtosis = kurtosis,
|
||||
new ParseField(InternalMatrixStats.Fields.KURTOSIS));
|
||||
RESULT_PARSER.declareString((result, name) -> result.name = name, new ParseField(InternalMatrixStats.Fields.NAME));
|
||||
RESULT_PARSER.declareLong((result, count) -> result.count = count, new ParseField(InternalMatrixStats.Fields.COUNT));
|
||||
RESULT_PARSER.declareDouble((result, mean) -> result.mean = mean, new ParseField(InternalMatrixStats.Fields.MEAN));
|
||||
RESULT_PARSER.declareDouble(
|
||||
(result, variance) -> result.variance = variance,
|
||||
new ParseField(InternalMatrixStats.Fields.VARIANCE)
|
||||
);
|
||||
RESULT_PARSER.declareDouble(
|
||||
(result, skewness) -> result.skewness = skewness,
|
||||
new ParseField(InternalMatrixStats.Fields.SKEWNESS)
|
||||
);
|
||||
RESULT_PARSER.declareDouble(
|
||||
(result, kurtosis) -> result.kurtosis = kurtosis,
|
||||
new ParseField(InternalMatrixStats.Fields.KURTOSIS)
|
||||
);
|
||||
|
||||
RESULT_PARSER.declareObject((ParsedMatrixStatsResult result, Map<String, Object> covars) -> {
|
||||
result.covariances = new LinkedHashMap<>(covars.size());
|
||||
|
|
|
@ -93,25 +93,25 @@ public class RunningStats implements Writeable, Cloneable {
|
|||
public RunningStats(StreamInput in) throws IOException {
|
||||
this();
|
||||
// read doc count
|
||||
docCount = (Long)in.readGenericValue();
|
||||
docCount = (Long) in.readGenericValue();
|
||||
// read fieldSum
|
||||
fieldSum = convertIfNeeded((Map<String, Double>)in.readGenericValue());
|
||||
fieldSum = convertIfNeeded((Map<String, Double>) in.readGenericValue());
|
||||
// counts
|
||||
counts = convertIfNeeded((Map<String, Long>)in.readGenericValue());
|
||||
counts = convertIfNeeded((Map<String, Long>) in.readGenericValue());
|
||||
// means
|
||||
means = convertIfNeeded((Map<String, Double>)in.readGenericValue());
|
||||
means = convertIfNeeded((Map<String, Double>) in.readGenericValue());
|
||||
// variances
|
||||
variances = convertIfNeeded((Map<String, Double>)in.readGenericValue());
|
||||
variances = convertIfNeeded((Map<String, Double>) in.readGenericValue());
|
||||
// skewness
|
||||
skewness = convertIfNeeded((Map<String, Double>)in.readGenericValue());
|
||||
skewness = convertIfNeeded((Map<String, Double>) in.readGenericValue());
|
||||
// kurtosis
|
||||
kurtosis = convertIfNeeded((Map<String, Double>)in.readGenericValue());
|
||||
kurtosis = convertIfNeeded((Map<String, Double>) in.readGenericValue());
|
||||
// read covariances
|
||||
covariances = convertIfNeeded((Map<String, HashMap<String, Double>>)in.readGenericValue());
|
||||
covariances = convertIfNeeded((Map<String, HashMap<String, Double>>) in.readGenericValue());
|
||||
}
|
||||
|
||||
// Convert Map to HashMap if it isn't
|
||||
private static <K, V> HashMap<K, V> convertIfNeeded(Map<K,V> map) {
|
||||
private static <K, V> HashMap<K, V> convertIfNeeded(Map<K, V> map) {
|
||||
if (map instanceof HashMap) {
|
||||
return (HashMap<K, V>) map;
|
||||
} else {
|
||||
|
@ -235,7 +235,7 @@ public class RunningStats implements Writeable, Cloneable {
|
|||
this.counts.put(fieldName, other.counts.get(fieldName).longValue());
|
||||
this.fieldSum.put(fieldName, other.fieldSum.get(fieldName).doubleValue());
|
||||
this.variances.put(fieldName, other.variances.get(fieldName).doubleValue());
|
||||
this.skewness.put(fieldName , other.skewness.get(fieldName).doubleValue());
|
||||
this.skewness.put(fieldName, other.skewness.get(fieldName).doubleValue());
|
||||
this.kurtosis.put(fieldName, other.kurtosis.get(fieldName).doubleValue());
|
||||
if (other.covariances.containsKey(fieldName)) {
|
||||
this.covariances.put(fieldName, other.covariances.get(fieldName));
|
||||
|
@ -338,14 +338,14 @@ public class RunningStats implements Writeable, Cloneable {
|
|||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
RunningStats that = (RunningStats) o;
|
||||
return docCount == that.docCount &&
|
||||
Objects.equals(fieldSum, that.fieldSum) &&
|
||||
Objects.equals(counts, that.counts) &&
|
||||
Objects.equals(means, that.means) &&
|
||||
Objects.equals(variances, that.variances) &&
|
||||
Objects.equals(skewness, that.skewness) &&
|
||||
Objects.equals(kurtosis, that.kurtosis) &&
|
||||
Objects.equals(covariances, that.covariances);
|
||||
return docCount == that.docCount
|
||||
&& Objects.equals(fieldSum, that.fieldSum)
|
||||
&& Objects.equals(counts, that.counts)
|
||||
&& Objects.equals(means, that.means)
|
||||
&& Objects.equals(variances, that.variances)
|
||||
&& Objects.equals(skewness, that.skewness)
|
||||
&& Objects.equals(kurtosis, that.kurtosis)
|
||||
&& Objects.equals(covariances, that.covariances);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -34,7 +34,6 @@ package org.opensearch.search.aggregations.support;
|
|||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.opensearch.index.fielddata.NumericDoubleValues;
|
||||
import org.opensearch.search.MultiValueMode;
|
||||
import org.opensearch.search.aggregations.support.ValuesSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
|
|
@ -50,13 +50,13 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
public abstract class ArrayValuesSourceAggregationBuilder<AB extends ArrayValuesSourceAggregationBuilder<AB>>
|
||||
extends AbstractAggregationBuilder<AB> {
|
||||
public abstract class ArrayValuesSourceAggregationBuilder<AB extends ArrayValuesSourceAggregationBuilder<AB>> extends
|
||||
AbstractAggregationBuilder<AB> {
|
||||
|
||||
public static final ParseField MULTIVALUE_MODE_FIELD = new ParseField("mode");
|
||||
|
||||
public abstract static class LeafOnly<AB extends ArrayValuesSourceAggregationBuilder<AB>>
|
||||
extends ArrayValuesSourceAggregationBuilder<AB> {
|
||||
public abstract static class LeafOnly<AB extends ArrayValuesSourceAggregationBuilder<AB>> extends ArrayValuesSourceAggregationBuilder<
|
||||
AB> {
|
||||
|
||||
protected LeafOnly(String name) {
|
||||
super(name);
|
||||
|
@ -65,8 +65,9 @@ public abstract class ArrayValuesSourceAggregationBuilder<AB extends ArrayValues
|
|||
protected LeafOnly(LeafOnly<AB> clone, Builder factoriesBuilder, Map<String, Object> metadata) {
|
||||
super(clone, factoriesBuilder, metadata);
|
||||
if (factoriesBuilder.count() > 0) {
|
||||
throw new AggregationInitializationException("Aggregator [" + name + "] of type ["
|
||||
+ getType() + "] cannot accept sub-aggregations");
|
||||
throw new AggregationInitializationException(
|
||||
"Aggregator [" + name + "] of type [" + getType() + "] cannot accept sub-aggregations"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -79,8 +80,9 @@ public abstract class ArrayValuesSourceAggregationBuilder<AB extends ArrayValues
|
|||
|
||||
@Override
|
||||
public AB subAggregations(Builder subFactories) {
|
||||
throw new AggregationInitializationException("Aggregator [" + name + "] of type [" +
|
||||
getType() + "] cannot accept sub-aggregations");
|
||||
throw new AggregationInitializationException(
|
||||
"Aggregator [" + name + "] of type [" + getType() + "] cannot accept sub-aggregations"
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -102,8 +104,11 @@ public abstract class ArrayValuesSourceAggregationBuilder<AB extends ArrayValues
|
|||
super(name);
|
||||
}
|
||||
|
||||
protected ArrayValuesSourceAggregationBuilder(ArrayValuesSourceAggregationBuilder<AB> clone,
|
||||
Builder factoriesBuilder, Map<String, Object> metadata) {
|
||||
protected ArrayValuesSourceAggregationBuilder(
|
||||
ArrayValuesSourceAggregationBuilder<AB> clone,
|
||||
Builder factoriesBuilder,
|
||||
Map<String, Object> metadata
|
||||
) {
|
||||
super(clone, factoriesBuilder, metadata);
|
||||
this.fields = new ArrayList<>(clone.fields);
|
||||
this.userValueTypeHint = clone.userValueTypeHint;
|
||||
|
@ -112,8 +117,7 @@ public abstract class ArrayValuesSourceAggregationBuilder<AB extends ArrayValues
|
|||
this.missing = clone.missing;
|
||||
}
|
||||
|
||||
protected ArrayValuesSourceAggregationBuilder(StreamInput in)
|
||||
throws IOException {
|
||||
protected ArrayValuesSourceAggregationBuilder(StreamInput in) throws IOException {
|
||||
super(in);
|
||||
read(in);
|
||||
}
|
||||
|
@ -123,7 +127,7 @@ public abstract class ArrayValuesSourceAggregationBuilder<AB extends ArrayValues
|
|||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private void read(StreamInput in) throws IOException {
|
||||
fields = (ArrayList<String>)in.readGenericValue();
|
||||
fields = (ArrayList<String>) in.readGenericValue();
|
||||
userValueTypeHint = in.readOptionalWriteable(ValueType::readFromStream);
|
||||
format = in.readOptionalString();
|
||||
missingMap = in.readMap();
|
||||
|
@ -203,8 +207,11 @@ public abstract class ArrayValuesSourceAggregationBuilder<AB extends ArrayValues
|
|||
}
|
||||
|
||||
@Override
|
||||
protected final ArrayValuesSourceAggregatorFactory doBuild(QueryShardContext queryShardContext, AggregatorFactory parent,
|
||||
Builder subFactoriesBuilder) throws IOException {
|
||||
protected final ArrayValuesSourceAggregatorFactory doBuild(
|
||||
QueryShardContext queryShardContext,
|
||||
AggregatorFactory parent,
|
||||
Builder subFactoriesBuilder
|
||||
) throws IOException {
|
||||
Map<String, ValuesSourceConfig> configs = resolveConfig(queryShardContext);
|
||||
ArrayValuesSourceAggregatorFactory factory = innerBuild(queryShardContext, configs, parent, subFactoriesBuilder);
|
||||
return factory;
|
||||
|
@ -213,17 +220,27 @@ public abstract class ArrayValuesSourceAggregationBuilder<AB extends ArrayValues
|
|||
protected Map<String, ValuesSourceConfig> resolveConfig(QueryShardContext queryShardContext) {
|
||||
HashMap<String, ValuesSourceConfig> configs = new HashMap<>();
|
||||
for (String field : fields) {
|
||||
ValuesSourceConfig config = ValuesSourceConfig.resolveUnregistered(queryShardContext, userValueTypeHint, field, null,
|
||||
missingMap.get(field), null, format, CoreValuesSourceType.BYTES);
|
||||
ValuesSourceConfig config = ValuesSourceConfig.resolveUnregistered(
|
||||
queryShardContext,
|
||||
userValueTypeHint,
|
||||
field,
|
||||
null,
|
||||
missingMap.get(field),
|
||||
null,
|
||||
format,
|
||||
CoreValuesSourceType.BYTES
|
||||
);
|
||||
configs.put(field, config);
|
||||
}
|
||||
return configs;
|
||||
}
|
||||
|
||||
protected abstract ArrayValuesSourceAggregatorFactory innerBuild(QueryShardContext queryShardContext,
|
||||
Map<String, ValuesSourceConfig> configs,
|
||||
AggregatorFactory parent,
|
||||
AggregatorFactories.Builder subFactoriesBuilder) throws IOException;
|
||||
protected abstract ArrayValuesSourceAggregatorFactory innerBuild(
|
||||
QueryShardContext queryShardContext,
|
||||
Map<String, ValuesSourceConfig> configs,
|
||||
AggregatorFactory parent,
|
||||
AggregatorFactories.Builder subFactoriesBuilder
|
||||
) throws IOException;
|
||||
|
||||
@Override
|
||||
public final XContentBuilder internalXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
|
|
|
@ -43,24 +43,29 @@ import java.io.IOException;
|
|||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class ArrayValuesSourceAggregatorFactory
|
||||
extends AggregatorFactory {
|
||||
public abstract class ArrayValuesSourceAggregatorFactory extends AggregatorFactory {
|
||||
|
||||
protected Map<String, ValuesSourceConfig> configs;
|
||||
|
||||
public ArrayValuesSourceAggregatorFactory(String name, Map<String, ValuesSourceConfig> configs,
|
||||
QueryShardContext queryShardContext, AggregatorFactory parent,
|
||||
AggregatorFactories.Builder subFactoriesBuilder,
|
||||
Map<String, Object> metadata) throws IOException {
|
||||
public ArrayValuesSourceAggregatorFactory(
|
||||
String name,
|
||||
Map<String, ValuesSourceConfig> configs,
|
||||
QueryShardContext queryShardContext,
|
||||
AggregatorFactory parent,
|
||||
AggregatorFactories.Builder subFactoriesBuilder,
|
||||
Map<String, Object> metadata
|
||||
) throws IOException {
|
||||
super(name, queryShardContext, parent, subFactoriesBuilder, metadata);
|
||||
this.configs = configs;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Aggregator createInternal(SearchContext searchContext,
|
||||
Aggregator parent,
|
||||
CardinalityUpperBound cardinality,
|
||||
Map<String, Object> metadata) throws IOException {
|
||||
public Aggregator createInternal(
|
||||
SearchContext searchContext,
|
||||
Aggregator parent,
|
||||
CardinalityUpperBound cardinality,
|
||||
Map<String, Object> metadata
|
||||
) throws IOException {
|
||||
HashMap<String, ValuesSource> valuesSources = new HashMap<>();
|
||||
|
||||
for (Map.Entry<String, ValuesSourceConfig> config : configs.entrySet()) {
|
||||
|
@ -79,9 +84,8 @@ public abstract class ArrayValuesSourceAggregatorFactory
|
|||
* Create the {@linkplain Aggregator} when none of the configured
|
||||
* fields can be resolved to a {@link ValuesSource}.
|
||||
*/
|
||||
protected abstract Aggregator createUnmapped(SearchContext searchContext,
|
||||
Aggregator parent,
|
||||
Map<String, Object> metadata) throws IOException;
|
||||
protected abstract Aggregator createUnmapped(SearchContext searchContext, Aggregator parent, Map<String, Object> metadata)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Create the {@linkplain Aggregator} when any of the configured
|
||||
|
@ -91,10 +95,12 @@ public abstract class ArrayValuesSourceAggregatorFactory
|
|||
* that the {@link Aggregator} created by this method
|
||||
* will be asked to collect.
|
||||
*/
|
||||
protected abstract Aggregator doCreateInternal(Map<String, ValuesSource> valuesSources,
|
||||
SearchContext searchContext,
|
||||
Aggregator parent,
|
||||
CardinalityUpperBound cardinality,
|
||||
Map<String, Object> metadata) throws IOException;
|
||||
protected abstract Aggregator doCreateInternal(
|
||||
Map<String, ValuesSource> valuesSources,
|
||||
SearchContext searchContext,
|
||||
Aggregator parent,
|
||||
CardinalityUpperBound cardinality,
|
||||
Map<String, Object> metadata
|
||||
) throws IOException;
|
||||
|
||||
}
|
||||
|
|
|
@ -80,8 +80,7 @@ public abstract class ArrayValuesSourceParser<VS extends ValuesSource> implement
|
|||
}
|
||||
|
||||
@Override
|
||||
public final ArrayValuesSourceAggregationBuilder<?> parse(String aggregationName, XContentParser parser)
|
||||
throws IOException {
|
||||
public final ArrayValuesSourceAggregationBuilder<?> parse(String aggregationName, XContentParser parser) throws IOException {
|
||||
|
||||
List<String> fields = null;
|
||||
String format = null;
|
||||
|
@ -98,12 +97,22 @@ public abstract class ArrayValuesSourceParser<VS extends ValuesSource> implement
|
|||
} else if (formattable && CommonFields.FORMAT.match(currentFieldName, parser.getDeprecationHandler())) {
|
||||
format = parser.text();
|
||||
} else if (CommonFields.VALUE_TYPE.match(currentFieldName, parser.getDeprecationHandler())) {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]. " +
|
||||
"Multi-field aggregations do not support scripts.");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Unexpected token "
|
||||
+ token
|
||||
+ " ["
|
||||
+ currentFieldName
|
||||
+ "] in ["
|
||||
+ aggregationName
|
||||
+ "]. "
|
||||
+ "Multi-field aggregations do not support scripts."
|
||||
);
|
||||
} else if (!token(aggregationName, currentFieldName, token, parser, otherOptions)) {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "].");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]."
|
||||
);
|
||||
}
|
||||
} else if (token == XContentParser.Token.START_OBJECT) {
|
||||
if (CommonFields.MISSING.match(currentFieldName, parser.getDeprecationHandler())) {
|
||||
|
@ -112,41 +121,69 @@ public abstract class ArrayValuesSourceParser<VS extends ValuesSource> implement
|
|||
parseMissingAndAdd(aggregationName, currentFieldName, parser, missingMap);
|
||||
}
|
||||
} else if (Script.SCRIPT_PARSE_FIELD.match(currentFieldName, parser.getDeprecationHandler())) {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]. " +
|
||||
"Multi-field aggregations do not support scripts.");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Unexpected token "
|
||||
+ token
|
||||
+ " ["
|
||||
+ currentFieldName
|
||||
+ "] in ["
|
||||
+ aggregationName
|
||||
+ "]. "
|
||||
+ "Multi-field aggregations do not support scripts."
|
||||
);
|
||||
|
||||
} else if (!token(aggregationName, currentFieldName, token, parser, otherOptions)) {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "].");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]."
|
||||
);
|
||||
}
|
||||
} else if (token == XContentParser.Token.START_ARRAY) {
|
||||
if (Script.SCRIPT_PARSE_FIELD.match(currentFieldName, parser.getDeprecationHandler())) {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]. " +
|
||||
"Multi-field aggregations do not support scripts.");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Unexpected token "
|
||||
+ token
|
||||
+ " ["
|
||||
+ currentFieldName
|
||||
+ "] in ["
|
||||
+ aggregationName
|
||||
+ "]. "
|
||||
+ "Multi-field aggregations do not support scripts."
|
||||
);
|
||||
} else if (CommonFields.FIELDS.match(currentFieldName, parser.getDeprecationHandler())) {
|
||||
fields = new ArrayList<>();
|
||||
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
|
||||
if (token == XContentParser.Token.VALUE_STRING) {
|
||||
fields.add(parser.text());
|
||||
} else {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "].");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]."
|
||||
);
|
||||
}
|
||||
}
|
||||
} else if (!token(aggregationName, currentFieldName, token, parser, otherOptions)) {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "].");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]."
|
||||
);
|
||||
}
|
||||
} else if (!token(aggregationName, currentFieldName, token, parser, otherOptions)) {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "].");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
ArrayValuesSourceAggregationBuilder<?> factory = createFactory(aggregationName, this.valuesSourceType, this.targetValueType,
|
||||
otherOptions);
|
||||
ArrayValuesSourceAggregationBuilder<?> factory = createFactory(
|
||||
aggregationName,
|
||||
this.valuesSourceType,
|
||||
this.targetValueType,
|
||||
otherOptions
|
||||
);
|
||||
if (fields != null) {
|
||||
factory.fields(fields);
|
||||
}
|
||||
|
@ -159,8 +196,12 @@ public abstract class ArrayValuesSourceParser<VS extends ValuesSource> implement
|
|||
return factory;
|
||||
}
|
||||
|
||||
private void parseMissingAndAdd(final String aggregationName, final String currentFieldName,
|
||||
XContentParser parser, final Map<String, Object> missing) throws IOException {
|
||||
private void parseMissingAndAdd(
|
||||
final String aggregationName,
|
||||
final String currentFieldName,
|
||||
XContentParser parser,
|
||||
final Map<String, Object> missing
|
||||
) throws IOException {
|
||||
XContentParser.Token token = parser.currentToken();
|
||||
if (token == null) {
|
||||
token = parser.nextToken();
|
||||
|
@ -169,15 +210,18 @@ public abstract class ArrayValuesSourceParser<VS extends ValuesSource> implement
|
|||
if (token == XContentParser.Token.FIELD_NAME) {
|
||||
final String fieldName = parser.currentName();
|
||||
if (missing.containsKey(fieldName)) {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Missing field [" + fieldName + "] already defined as [" + missing.get(fieldName)
|
||||
+ "] in [" + aggregationName + "].");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Missing field [" + fieldName + "] already defined as [" + missing.get(fieldName) + "] in [" + aggregationName + "]."
|
||||
);
|
||||
}
|
||||
parser.nextToken();
|
||||
missing.put(fieldName, parser.objectText());
|
||||
} else {
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]");
|
||||
throw new ParsingException(
|
||||
parser.getTokenLocation(),
|
||||
"Unexpected token " + token + " [" + currentFieldName + "] in [" + aggregationName + "]"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -199,10 +243,12 @@ public abstract class ArrayValuesSourceParser<VS extends ValuesSource> implement
|
|||
* method
|
||||
* @return the created factory
|
||||
*/
|
||||
protected abstract ArrayValuesSourceAggregationBuilder<?> createFactory(String aggregationName,
|
||||
ValuesSourceType valuesSourceType,
|
||||
ValueType targetValueType,
|
||||
Map<ParseField, Object> otherOptions);
|
||||
protected abstract ArrayValuesSourceAggregationBuilder<?> createFactory(
|
||||
String aggregationName,
|
||||
ValuesSourceType valuesSourceType,
|
||||
ValueType targetValueType,
|
||||
Map<ParseField, Object> otherOptions
|
||||
);
|
||||
|
||||
/**
|
||||
* Allows subclasses of {@link ArrayValuesSourceParser} to parse extra
|
||||
|
@ -227,6 +273,11 @@ public abstract class ArrayValuesSourceParser<VS extends ValuesSource> implement
|
|||
* @throws IOException
|
||||
* if an error occurs whilst parsing
|
||||
*/
|
||||
protected abstract boolean token(String aggregationName, String currentFieldName, XContentParser.Token token, XContentParser parser,
|
||||
Map<ParseField, Object> otherOptions) throws IOException;
|
||||
protected abstract boolean token(
|
||||
String aggregationName,
|
||||
String currentFieldName,
|
||||
XContentParser.Token token,
|
||||
XContentParser parser,
|
||||
Map<ParseField, Object> otherOptions
|
||||
) throws IOException;
|
||||
}
|
||||
|
|
|
@ -107,35 +107,35 @@ public class InternalMatrixStatsTests extends InternalAggregationTestCase<Intern
|
|||
MatrixStatsResults matrixStatsResults = instance.getResults();
|
||||
Map<String, Object> metadata = instance.getMetadata();
|
||||
switch (between(0, 3)) {
|
||||
case 0:
|
||||
name += randomAlphaOfLength(5);
|
||||
break;
|
||||
case 1:
|
||||
String[] fields = Arrays.copyOf(this.fields, this.fields.length + 1);
|
||||
fields[fields.length - 1] = "field_" + (fields.length - 1);
|
||||
double[] values = new double[fields.length];
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
values[i] = randomDouble() * 200;
|
||||
}
|
||||
runningStats = new RunningStats();
|
||||
runningStats.add(fields, values);
|
||||
break;
|
||||
case 2:
|
||||
if (matrixStatsResults == null) {
|
||||
matrixStatsResults = new MatrixStatsResults(runningStats);
|
||||
} else {
|
||||
matrixStatsResults = null;
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
default:
|
||||
if (metadata == null) {
|
||||
metadata = new HashMap<>(1);
|
||||
} else {
|
||||
metadata = new HashMap<>(instance.getMetadata());
|
||||
}
|
||||
metadata.put(randomAlphaOfLength(15), randomInt());
|
||||
break;
|
||||
case 0:
|
||||
name += randomAlphaOfLength(5);
|
||||
break;
|
||||
case 1:
|
||||
String[] fields = Arrays.copyOf(this.fields, this.fields.length + 1);
|
||||
fields[fields.length - 1] = "field_" + (fields.length - 1);
|
||||
double[] values = new double[fields.length];
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
values[i] = randomDouble() * 200;
|
||||
}
|
||||
runningStats = new RunningStats();
|
||||
runningStats.add(fields, values);
|
||||
break;
|
||||
case 2:
|
||||
if (matrixStatsResults == null) {
|
||||
matrixStatsResults = new MatrixStatsResults(runningStats);
|
||||
} else {
|
||||
matrixStatsResults = null;
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
default:
|
||||
if (metadata == null) {
|
||||
metadata = new HashMap<>(1);
|
||||
} else {
|
||||
metadata = new HashMap<>(instance.getMetadata());
|
||||
}
|
||||
metadata.put(randomAlphaOfLength(15), randomInt());
|
||||
break;
|
||||
}
|
||||
return new InternalMatrixStats(name, docCount, runningStats, matrixStatsResults, metadata);
|
||||
}
|
||||
|
@ -159,7 +159,7 @@ public class InternalMatrixStatsTests extends InternalAggregationTestCase<Intern
|
|||
double valueB = randomDouble();
|
||||
bValues.add(valueB);
|
||||
|
||||
runningStats.add(new String[]{"a", "b"}, new double[]{valueA, valueB});
|
||||
runningStats.add(new String[] { "a", "b" }, new double[] { valueA, valueB });
|
||||
if (++valuePerShardCounter == valuesPerShard) {
|
||||
shardResults.add(new InternalMatrixStats("_name", 1L, runningStats, null, Collections.emptyMap()));
|
||||
runningStats = new RunningStats();
|
||||
|
@ -176,7 +176,11 @@ public class InternalMatrixStatsTests extends InternalAggregationTestCase<Intern
|
|||
ScriptService mockScriptService = mockScriptService();
|
||||
MockBigArrays bigArrays = new MockBigArrays(new MockPageCacheRecycler(Settings.EMPTY), new NoneCircuitBreakerService());
|
||||
InternalAggregation.ReduceContext context = InternalAggregation.ReduceContext.forFinalReduction(
|
||||
bigArrays, mockScriptService, b -> {}, PipelineTree.EMPTY);
|
||||
bigArrays,
|
||||
mockScriptService,
|
||||
b -> {},
|
||||
PipelineTree.EMPTY
|
||||
);
|
||||
InternalMatrixStats reduced = (InternalMatrixStats) shardResults.get(0).reduce(shardResults, context);
|
||||
multiPassStats.assertNearlyEqual(reduced.getResults());
|
||||
}
|
||||
|
|
|
@ -54,18 +54,17 @@ import java.util.List;
|
|||
public class MatrixStatsAggregatorTests extends AggregatorTestCase {
|
||||
|
||||
public void testNoData() throws Exception {
|
||||
MappedFieldType ft =
|
||||
new NumberFieldMapper.NumberFieldType("field", NumberFieldMapper.NumberType.DOUBLE);
|
||||
MappedFieldType ft = new NumberFieldMapper.NumberFieldType("field", NumberFieldMapper.NumberType.DOUBLE);
|
||||
|
||||
try (Directory directory = newDirectory();
|
||||
RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
|
||||
try (Directory directory = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
|
||||
if (randomBoolean()) {
|
||||
indexWriter.addDocument(Collections.singleton(new StringField("another_field", "value", Field.Store.NO)));
|
||||
}
|
||||
try (IndexReader reader = indexWriter.getReader()) {
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
MatrixStatsAggregationBuilder aggBuilder = new MatrixStatsAggregationBuilder("my_agg")
|
||||
.fields(Collections.singletonList("field"));
|
||||
MatrixStatsAggregationBuilder aggBuilder = new MatrixStatsAggregationBuilder("my_agg").fields(
|
||||
Collections.singletonList("field")
|
||||
);
|
||||
InternalMatrixStats stats = searchAndReduce(searcher, new MatchAllDocsQuery(), aggBuilder, ft);
|
||||
assertNull(stats.getStats());
|
||||
assertEquals(0L, stats.getDocCount());
|
||||
|
@ -76,15 +75,15 @@ public class MatrixStatsAggregatorTests extends AggregatorTestCase {
|
|||
public void testUnmapped() throws Exception {
|
||||
MappedFieldType ft = new NumberFieldMapper.NumberFieldType("field", NumberFieldMapper.NumberType.DOUBLE);
|
||||
|
||||
try (Directory directory = newDirectory();
|
||||
RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
|
||||
try (Directory directory = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
|
||||
if (randomBoolean()) {
|
||||
indexWriter.addDocument(Collections.singleton(new StringField("another_field", "value", Field.Store.NO)));
|
||||
}
|
||||
try (IndexReader reader = indexWriter.getReader()) {
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
MatrixStatsAggregationBuilder aggBuilder = new MatrixStatsAggregationBuilder("my_agg")
|
||||
.fields(Collections.singletonList("bogus"));
|
||||
MatrixStatsAggregationBuilder aggBuilder = new MatrixStatsAggregationBuilder("my_agg").fields(
|
||||
Collections.singletonList("bogus")
|
||||
);
|
||||
InternalMatrixStats stats = searchAndReduce(searcher, new MatchAllDocsQuery(), aggBuilder, ft);
|
||||
assertNull(stats.getStats());
|
||||
assertEquals(0L, stats.getDocCount());
|
||||
|
@ -98,8 +97,7 @@ public class MatrixStatsAggregatorTests extends AggregatorTestCase {
|
|||
String fieldB = "b";
|
||||
MappedFieldType ftB = new NumberFieldMapper.NumberFieldType(fieldB, NumberFieldMapper.NumberType.DOUBLE);
|
||||
|
||||
try (Directory directory = newDirectory();
|
||||
RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
|
||||
try (Directory directory = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
|
||||
|
||||
int numDocs = scaledRandomIntBetween(8192, 16384);
|
||||
Double[] fieldAValues = new Double[numDocs];
|
||||
|
@ -118,8 +116,9 @@ public class MatrixStatsAggregatorTests extends AggregatorTestCase {
|
|||
multiPassStats.computeStats(Arrays.asList(fieldAValues), Arrays.asList(fieldBValues));
|
||||
try (IndexReader reader = indexWriter.getReader()) {
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
MatrixStatsAggregationBuilder aggBuilder = new MatrixStatsAggregationBuilder("my_agg")
|
||||
.fields(Arrays.asList(fieldA, fieldB));
|
||||
MatrixStatsAggregationBuilder aggBuilder = new MatrixStatsAggregationBuilder("my_agg").fields(
|
||||
Arrays.asList(fieldA, fieldB)
|
||||
);
|
||||
InternalMatrixStats stats = searchAndReduce(searcher, new MatchAllDocsQuery(), aggBuilder, ftA, ftB);
|
||||
multiPassStats.assertNearlyEqual(stats);
|
||||
assertTrue(MatrixAggregationInspectionHelper.hasValue(stats));
|
||||
|
|
|
@ -142,8 +142,8 @@ class MultiPassStats {
|
|||
assertTrue(nearlyEqual(kurtosis.get(fieldAKey), stats.getKurtosis(fieldAKey), 1e-4));
|
||||
assertTrue(nearlyEqual(kurtosis.get(fieldBKey), stats.getKurtosis(fieldBKey), 1e-4));
|
||||
// covariances
|
||||
assertTrue(nearlyEqual(covariances.get(fieldAKey).get(fieldBKey),stats.getCovariance(fieldAKey, fieldBKey), 1e-7));
|
||||
assertTrue(nearlyEqual(covariances.get(fieldBKey).get(fieldAKey),stats.getCovariance(fieldBKey, fieldAKey), 1e-7));
|
||||
assertTrue(nearlyEqual(covariances.get(fieldAKey).get(fieldBKey), stats.getCovariance(fieldAKey, fieldBKey), 1e-7));
|
||||
assertTrue(nearlyEqual(covariances.get(fieldBKey).get(fieldAKey), stats.getCovariance(fieldBKey, fieldAKey), 1e-7));
|
||||
// correlation
|
||||
assertTrue(nearlyEqual(correlations.get(fieldAKey).get(fieldBKey), stats.getCorrelation(fieldAKey, fieldBKey), 1e-7));
|
||||
assertTrue(nearlyEqual(correlations.get(fieldBKey).get(fieldAKey), stats.getCorrelation(fieldBKey, fieldAKey), 1e-7));
|
||||
|
@ -166,8 +166,8 @@ class MultiPassStats {
|
|||
assertTrue(nearlyEqual(kurtosis.get(fieldAKey), stats.getKurtosis(fieldAKey), 1e-4));
|
||||
assertTrue(nearlyEqual(kurtosis.get(fieldBKey), stats.getKurtosis(fieldBKey), 1e-4));
|
||||
// covariances
|
||||
assertTrue(nearlyEqual(covariances.get(fieldAKey).get(fieldBKey),stats.getCovariance(fieldAKey, fieldBKey), 1e-7));
|
||||
assertTrue(nearlyEqual(covariances.get(fieldBKey).get(fieldAKey),stats.getCovariance(fieldBKey, fieldAKey), 1e-7));
|
||||
assertTrue(nearlyEqual(covariances.get(fieldAKey).get(fieldBKey), stats.getCovariance(fieldAKey, fieldBKey), 1e-7));
|
||||
assertTrue(nearlyEqual(covariances.get(fieldBKey).get(fieldAKey), stats.getCovariance(fieldBKey, fieldAKey), 1e-7));
|
||||
// correlation
|
||||
assertTrue(nearlyEqual(correlations.get(fieldAKey).get(fieldBKey), stats.getCorrelation(fieldAKey, fieldBKey), 1e-7));
|
||||
assertTrue(nearlyEqual(correlations.get(fieldBKey).get(fieldAKey), stats.getCorrelation(fieldBKey, fieldAKey), 1e-7));
|
||||
|
|
|
@ -49,9 +49,9 @@ public class RunningStatsTests extends BaseMatrixStatsTestCase {
|
|||
int start = 0;
|
||||
RunningStats stats = null;
|
||||
List<Double> fieldAShard, fieldBShard;
|
||||
for (int s = 0; s < numShards-1; start = ++s * (int)obsPerShard) {
|
||||
fieldAShard = fieldA.subList(start, start + (int)obsPerShard);
|
||||
fieldBShard = fieldB.subList(start, start + (int)obsPerShard);
|
||||
for (int s = 0; s < numShards - 1; start = ++s * (int) obsPerShard) {
|
||||
fieldAShard = fieldA.subList(start, start + (int) obsPerShard);
|
||||
fieldBShard = fieldB.subList(start, start + (int) obsPerShard);
|
||||
if (stats == null) {
|
||||
stats = createRunningStats(fieldAShard, fieldBShard);
|
||||
} else {
|
||||
|
|
|
@ -37,7 +37,7 @@ import org.opensearch.test.rest.yaml.ClientYamlTestCandidate;
|
|||
import org.opensearch.test.rest.yaml.OpenSearchClientYamlSuiteTestCase;
|
||||
|
||||
public class MatrixStatsClientYamlTestSuiteIT extends OpenSearchClientYamlSuiteTestCase {
|
||||
public MatrixStatsClientYamlTestSuiteIT(@Name("yaml")ClientYamlTestCandidate testCandidate) {
|
||||
public MatrixStatsClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
|
||||
super(testCandidate);
|
||||
}
|
||||
|
||||
|
|
|
@ -55,8 +55,12 @@ public class QueryStringWithAnalyzersIT extends OpenSearchIntegTestCase {
|
|||
* Validates that we properly split fields using the word delimiter filter in query_string.
|
||||
*/
|
||||
public void testCustomWordDelimiterQueryString() {
|
||||
assertAcked(client().admin().indices().prepareCreate("test")
|
||||
.setSettings(Settings.builder()
|
||||
assertAcked(
|
||||
client().admin()
|
||||
.indices()
|
||||
.prepareCreate("test")
|
||||
.setSettings(
|
||||
Settings.builder()
|
||||
.put("analysis.analyzer.my_analyzer.type", "custom")
|
||||
.put("analysis.analyzer.my_analyzer.tokenizer", "whitespace")
|
||||
.put("analysis.analyzer.my_analyzer.filter", "custom_word_delimiter")
|
||||
|
@ -67,21 +71,17 @@ public class QueryStringWithAnalyzersIT extends OpenSearchIntegTestCase {
|
|||
.put("analysis.filter.custom_word_delimiter.catenate_words", "false")
|
||||
.put("analysis.filter.custom_word_delimiter.split_on_case_change", "false")
|
||||
.put("analysis.filter.custom_word_delimiter.split_on_numerics", "false")
|
||||
.put("analysis.filter.custom_word_delimiter.stem_english_possessive", "false"))
|
||||
.addMapping("type1",
|
||||
"field1", "type=text,analyzer=my_analyzer",
|
||||
"field2", "type=text,analyzer=my_analyzer"));
|
||||
.put("analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
|
||||
)
|
||||
.addMapping("type1", "field1", "type=text,analyzer=my_analyzer", "field2", "type=text,analyzer=my_analyzer")
|
||||
);
|
||||
|
||||
client().prepareIndex("test", "type1", "1").setSource(
|
||||
"field1", "foo bar baz",
|
||||
"field2", "not needed").get();
|
||||
client().prepareIndex("test", "type1", "1").setSource("field1", "foo bar baz", "field2", "not needed").get();
|
||||
refresh();
|
||||
|
||||
SearchResponse response = client()
|
||||
.prepareSearch("test")
|
||||
.setQuery(
|
||||
queryStringQuery("foo.baz").defaultOperator(Operator.AND)
|
||||
.field("field1").field("field2")).get();
|
||||
SearchResponse response = client().prepareSearch("test")
|
||||
.setQuery(queryStringQuery("foo.baz").defaultOperator(Operator.AND).field("field1").field("field2"))
|
||||
.get();
|
||||
assertHitCount(response, 1L);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,16 +45,14 @@ import org.opensearch.index.analysis.TokenFilterFactory;
|
|||
/**
|
||||
* Factory for ASCIIFoldingFilter.
|
||||
*/
|
||||
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
|
||||
implements NormalizingTokenFilterFactory {
|
||||
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {
|
||||
|
||||
public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
|
||||
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
|
||||
|
||||
private final boolean preserveOriginal;
|
||||
|
||||
public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment,
|
||||
String name, Settings settings) {
|
||||
public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
|
||||
}
|
||||
|
|
|
@ -43,8 +43,10 @@ import java.util.Map;
|
|||
|
||||
public class AnalysisPainlessExtension implements PainlessExtension {
|
||||
|
||||
private static final Whitelist WHITELIST =
|
||||
WhitelistLoader.loadFromResourceFiles(AnalysisPainlessExtension.class, "painless_whitelist.txt");
|
||||
private static final Whitelist WHITELIST = WhitelistLoader.loadFromResourceFiles(
|
||||
AnalysisPainlessExtension.class,
|
||||
"painless_whitelist.txt"
|
||||
);
|
||||
|
||||
@Override
|
||||
public Map<ScriptContext<?>, List<Whitelist>> getContextWhitelists() {
|
||||
|
|
|
@ -124,7 +124,7 @@ public abstract class AnalysisPredicateScript {
|
|||
AnalysisPredicateScript newInstance();
|
||||
}
|
||||
|
||||
public static final String[] PARAMETERS = new String[]{ "token" };
|
||||
public static final String[] PARAMETERS = new String[] { "token" };
|
||||
public static final ScriptContext<Factory> CONTEXT = new ScriptContext<>("analysis", Factory.class);
|
||||
|
||||
}
|
||||
|
|
|
@ -111,12 +111,12 @@ public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
|
|||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (outputUnigrams) {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() +
|
||||
"] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
DEPRECATION_LOGGER.deprecate("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
} else {
|
||||
DEPRECATION_LOGGER.deprecate(
|
||||
"synonym_tokenfilters",
|
||||
"Token filter [" + name() + "] will not be usable to parse synonyms after v7.0"
|
||||
);
|
||||
}
|
||||
}
|
||||
return this;
|
||||
|
|
|
@ -43,7 +43,7 @@ import org.opensearch.index.analysis.AbstractTokenizerFactory;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
|
||||
public class CharGroupTokenizerFactory extends AbstractTokenizerFactory {
|
||||
|
||||
static final String MAX_TOKEN_LENGTH = "max_token_length";
|
||||
|
||||
|
@ -67,8 +67,7 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
|
|||
|
||||
if (c.length() == 1) {
|
||||
tokenizeOnChars.add((int) c.charAt(0));
|
||||
}
|
||||
else if (c.charAt(0) == '\\') {
|
||||
} else if (c.charAt(0) == '\\') {
|
||||
tokenizeOnChars.add((int) parseEscapedChar(c));
|
||||
} else {
|
||||
switch (c) {
|
||||
|
@ -98,8 +97,7 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
|
|||
int len = s.length();
|
||||
char c = s.charAt(0);
|
||||
if (c == '\\') {
|
||||
if (1 >= len)
|
||||
throw new RuntimeException("Invalid escaped char in [" + s + "]");
|
||||
if (1 >= len) throw new RuntimeException("Invalid escaped char in [" + s + "]");
|
||||
c = s.charAt(1);
|
||||
switch (c) {
|
||||
case '\\':
|
||||
|
|
|
@@ -81,16 +81,16 @@ public interface CharMatcher {
        @Override
        public boolean isTokenChar(int c) {
            switch (Character.getType(c)) {
                case Character.START_PUNCTUATION:
                case Character.END_PUNCTUATION:
                case Character.OTHER_PUNCTUATION:
                case Character.CONNECTOR_PUNCTUATION:
                case Character.DASH_PUNCTUATION:
                case Character.INITIAL_QUOTE_PUNCTUATION:
                case Character.FINAL_QUOTE_PUNCTUATION:
                    return true;
                default:
                    return false;
            }
        }
    },

@@ -98,13 +98,13 @@ public interface CharMatcher {
        @Override
        public boolean isTokenChar(int c) {
            switch (Character.getType(c)) {
                case Character.CURRENCY_SYMBOL:
                case Character.MATH_SYMBOL:
                case Character.OTHER_SYMBOL:
                case Character.MODIFIER_SYMBOL:
                    return true;
                default:
                    return false;
            }
        }
    }

@@ -112,36 +112,39 @@ public interface CharMatcher {

    final class Builder {
        private final Set<CharMatcher> matchers;

        Builder() {
            matchers = new HashSet<>();
        }

        public Builder or(CharMatcher matcher) {
            matchers.add(matcher);
            return this;
        }

        public CharMatcher build() {
            switch (matchers.size()) {
                case 0:
                    return new CharMatcher() {
                        @Override
                        public boolean isTokenChar(int c) {
                            return false;
                        }
                    };
                case 1:
                    return matchers.iterator().next();
                default:
                    return new CharMatcher() {
                        @Override
                        public boolean isTokenChar(int c) {
                            for (CharMatcher matcher : matchers) {
                                if (matcher.isTokenChar(c)) {
                                    return true;
                                }
                            }
                            return false;
                        }
                    };
            }
        }
    }

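Not part of the change itself: a minimal usage sketch of the CharMatcher.Builder reconstructed above. It assumes calling code in the same package as CharMatcher (the Builder constructor is package-private); the digit and letter matchers are illustrative stand-ins, not types from the plugin.

    // Combine two matchers through the or()/build() API shown in the hunk above.
    CharMatcher digits = new CharMatcher() {
        @Override
        public boolean isTokenChar(int c) {
            return Character.isDigit(c); // hypothetical matcher, for illustration only
        }
    };
    CharMatcher letters = new CharMatcher() {
        @Override
        public boolean isTokenChar(int c) {
            return Character.isLetter(c); // hypothetical matcher, for illustration only
        }
    };
    CharMatcher combined = new CharMatcher.Builder().or(digits).or(letters).build();
    boolean keep = combined.isTokenChar('7'); // true: the digit matcher accepts '7'
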
@@ -174,12 +174,19 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
    private final SetOnce<ScriptService> scriptService = new SetOnce<>();

    @Override
    public Collection<Object> createComponents(
        Client client,
        ClusterService clusterService,
        ThreadPool threadPool,
        ResourceWatcherService resourceWatcherService,
        ScriptService scriptService,
        NamedXContentRegistry xContentRegistry,
        Environment environment,
        NodeEnvironment nodeEnvironment,
        NamedWriteableRegistry namedWriteableRegistry,
        IndexNameExpressionResolver expressionResolver,
        Supplier<RepositoriesService> repositoriesServiceSupplier
    ) {
        this.scriptService.set(scriptService);
        return Collections.emptyList();
    }

@@ -253,8 +260,10 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        filters.put("classic", ClassicFilterFactory::new);
        filters.put("czech_stem", CzechStemTokenFilterFactory::new);
        filters.put("common_grams", requiresAnalysisSettings(CommonGramsTokenFilterFactory::new));
        filters.put(
            "condition",
            requiresAnalysisSettings((i, e, n, s) -> new ScriptedConditionTokenFilterFactory(i, n, s, scriptService.get()))
        );
        filters.put("decimal_digit", DecimalDigitFilterFactory::new);
        filters.put("delimited_payload_filter", LegacyDelimitedPayloadTokenFilterFactory::new);
        filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new);

@@ -262,9 +271,11 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        filters.put("dutch_stem", DutchStemTokenFilterFactory::new);
        filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
        filters.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
            deprecationLogger.deprecate(
                "edgeNGram_deprecation",
                "The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
                    + "Please change the filter name to [edge_ngram] instead."
            );
            return new EdgeNGramTokenFilterFactory(indexSettings, environment, name, settings);
        });
        filters.put("elision", requiresAnalysisSettings(ElisionTokenFilterFactory::new));

@@ -287,17 +298,21 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        filters.put("multiplexer", MultiplexerTokenFilterFactory::new);
        filters.put("ngram", NGramTokenFilterFactory::new);
        filters.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
            deprecationLogger.deprecate(
                "nGram_deprecation",
                "The [nGram] token filter name is deprecated and will be removed in a future version. "
                    + "Please change the filter name to [ngram] instead."
            );
            return new NGramTokenFilterFactory(indexSettings, environment, name, settings);
        });
        filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
        filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
        filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
        filters.put("porter_stem", PorterStemTokenFilterFactory::new);
        filters.put(
            "predicate_token_filter",
            requiresAnalysisSettings((i, e, n, s) -> new PredicateTokenFilterScriptFactory(i, n, s, scriptService.get()))
        );
        filters.put("remove_duplicates", RemoveDuplicatesTokenFilterFactory::new);
        filters.put("reverse", ReverseTokenFilterFactory::new);
        filters.put("russian_stem", RussianStemTokenFilterFactory::new);

@@ -336,18 +351,22 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        tokenizers.put("thai", ThaiTokenizerFactory::new);
        tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
            if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_6_0)) {
                deprecationLogger.deprecate(
                    "nGram_tokenizer_deprecation",
                    "The [nGram] tokenizer name is deprecated and will be removed in a future version. "
                        + "Please change the tokenizer name to [ngram] instead."
                );
            }
            return new NGramTokenizerFactory(indexSettings, environment, name, settings);
        });
        tokenizers.put("ngram", NGramTokenizerFactory::new);
        tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
            if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_6_0)) {
                deprecationLogger.deprecate(
                    "edgeNGram_tokenizer_deprecation",
                    "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
                        + "Please change the tokenizer name to [edge_ngram] instead."
                );
            }
            return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings);
        });

@@ -370,13 +389,27 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
    public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactories() {
        List<PreBuiltAnalyzerProviderFactory> analyzers = new ArrayList<>();
        // TODO remove in 8.0
        analyzers.add(
            new PreBuiltAnalyzerProviderFactory(
                "standard_html_strip",
                CachingStrategy.OPENSEARCH,
                () -> new StandardHtmlStripAnalyzer(CharArraySet.EMPTY_SET)
            )
        );
        analyzers.add(
            new PreBuiltAnalyzerProviderFactory(
                "pattern",
                CachingStrategy.OPENSEARCH,
                () -> new PatternAnalyzer(Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, CharArraySet.EMPTY_SET)
            )
        );
        analyzers.add(
            new PreBuiltAnalyzerProviderFactory(
                "snowball",
                CachingStrategy.LUCENE,
                () -> new SnowballAnalyzer("English", EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)
            )
        );

        // Language analyzers:
        analyzers.add(new PreBuiltAnalyzerProviderFactory("arabic", CachingStrategy.LUCENE, ArabicAnalyzer::new));

@@ -387,8 +420,13 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        analyzers.add(new PreBuiltAnalyzerProviderFactory("bulgarian", CachingStrategy.LUCENE, BulgarianAnalyzer::new));
        analyzers.add(new PreBuiltAnalyzerProviderFactory("catalan", CachingStrategy.LUCENE, CatalanAnalyzer::new));
        // chinese analyzer: only for old indices, best effort
        analyzers.add(
            new PreBuiltAnalyzerProviderFactory(
                "chinese",
                CachingStrategy.ONE,
                () -> new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)
            )
        );
        analyzers.add(new PreBuiltAnalyzerProviderFactory("cjk", CachingStrategy.LUCENE, CJKAnalyzer::new));
        analyzers.add(new PreBuiltAnalyzerProviderFactory("czech", CachingStrategy.LUCENE, CzechAnalyzer::new));
        analyzers.add(new PreBuiltAnalyzerProviderFactory("danish", CachingStrategy.LUCENE, DanishAnalyzer::new));

@@ -426,9 +464,11 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        filters.add(PreConfiguredCharFilter.singleton("html_strip", false, HTMLStripCharFilter::new));
        filters.add(PreConfiguredCharFilter.openSearchVersion("htmlStrip", false, (reader, version) -> {
            if (version.onOrAfter(LegacyESVersion.V_6_3_0)) {
                deprecationLogger.deprecate(
                    "htmlStrip_deprecation",
                    "The [htmpStrip] char filter name is deprecated and will be removed in a future version. "
                        + "Please change the filter name to [html_strip] instead."
                );
            }
            return new HTMLStripCharFilter(reader);
        }));

@@ -447,44 +487,60 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
        filters.add(
            PreConfiguredTokenFilter.singleton("common_grams", false, false, input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET))
        );
        filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
        filters.add(PreConfiguredTokenFilter.openSearchVersion("delimited_payload_filter", false, (input, version) -> {
            if (version.onOrAfter(LegacyESVersion.V_7_0_0)) {
                throw new IllegalArgumentException(
                    "[delimited_payload_filter] is not supported for new indices, use [delimited_payload] instead"
                );
            }
            if (version.onOrAfter(LegacyESVersion.V_6_2_0)) {
                deprecationLogger.deprecate(
                    "analysis_delimited_payload_filter",
                    "Deprecated [delimited_payload_filter] used, replaced by [delimited_payload]"
                );
            }
            return new DelimitedPayloadTokenFilter(
                input,
                DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER
            );
        }));
        filters.add(
            PreConfiguredTokenFilter.singleton(
                "delimited_payload",
                false,
                input -> new DelimitedPayloadTokenFilter(
                    input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER
                )
            )
        );
        filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
        filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input -> new EdgeNGramTokenFilter(input, 1)));
        filters.add(PreConfiguredTokenFilter.openSearchVersion("edgeNGram", false, false, (reader, version) -> {
            if (version.onOrAfter(LegacyESVersion.V_7_0_0)) {
                throw new IllegalArgumentException(
                    "The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
                        + "Please change the filter name to [edge_ngram] instead."
                );
            } else {
                deprecationLogger.deprecate(
                    "edgeNGram_deprecation",
                    "The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
                        + "Please change the filter name to [edge_ngram] instead."
                );
            }
            return new EdgeNGramTokenFilter(reader, 1);
        }));
        filters.add(
            PreConfiguredTokenFilter.singleton("elision", true, input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES))
        );
        filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
        filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));

@@ -494,19 +550,30 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
        // TODO this one seems useless
        filters.add(PreConfiguredTokenFilter.singleton("length", false, input -> new LengthFilter(input, 0, Integer.MAX_VALUE)));
        filters.add(
            PreConfiguredTokenFilter.singleton(
                "limit",
                false,
                input -> new LimitTokenCountFilter(
                    input,
                    LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS
                )
            )
        );
        filters.add(PreConfiguredTokenFilter.singleton("ngram", false, false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
        filters.add(PreConfiguredTokenFilter.openSearchVersion("nGram", false, false, (reader, version) -> {
            if (version.onOrAfter(LegacyESVersion.V_7_0_0)) {
                throw new IllegalArgumentException(
                    "The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
                        + "Please change the filter name to [ngram] instead."
                );
            } else {
                deprecationLogger.deprecate(
                    "nGram_deprecation",
                    "The [nGram] token filter name is deprecated and will be removed in a future version. "
                        + "Please change the filter name to [ngram] instead."
                );
            }
            return new NGramTokenFilter(reader, 1, 2, false);
        }));

@@ -531,28 +598,39 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
        // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
        filters.add(
            PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET))
        );
        filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
        filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
        filters.add(
            PreConfiguredTokenFilter.singleton(
                "word_delimiter",
                false,
                false,
                input -> new WordDelimiterFilter(
                    input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS
                        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE,
                    null
                )
            )
        );
        filters.add(PreConfiguredTokenFilter.openSearchVersion("word_delimiter_graph", false, false, (input, version) -> {
            boolean adjustOffsets = version.onOrAfter(LegacyESVersion.V_7_3_0);
            return new WordDelimiterGraphFilter(
                input,
                adjustOffsets,
                WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                    | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE,
                null
            );
        }));
        return filters;
    }

@@ -582,17 +660,21 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        // Temporary shim for aliases. TODO deprecate after they are moved
        tokenizers.add(PreConfiguredTokenizer.openSearchVersion("nGram", (version) -> {
            if (version.onOrAfter(LegacyESVersion.V_7_6_0)) {
                deprecationLogger.deprecate(
                    "nGram_tokenizer_deprecation",
                    "The [nGram] tokenizer name is deprecated and will be removed in a future version. "
                        + "Please change the tokenizer name to [ngram] instead."
                );
            }
            return new NGramTokenizer();
        }));
        tokenizers.add(PreConfiguredTokenizer.openSearchVersion("edgeNGram", (version) -> {
            if (version.onOrAfter(LegacyESVersion.V_7_6_0)) {
                deprecationLogger.deprecate(
                    "edgeNGram_tokenizer_deprecation",
                    "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
                        + "Please change the tokenizer name to [edge_ngram] instead."
                );
            }
            if (version.onOrAfter(LegacyESVersion.V_7_3_0)) {
                return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);

@@ -47,7 +47,7 @@ import org.opensearch.index.analysis.TokenFilterFactory;

public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {

    private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(CommonGramsTokenFilterFactory.class);

    private final CharArraySet words;

@@ -63,7 +63,8 @@ public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {

        if (this.words == null) {
            throw new IllegalArgumentException(
                "missing or empty [common_words] or [common_words_path] configuration for common_grams token filter"
            );
        }
    }

@@ -82,11 +83,12 @@ public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {
        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
            throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
        } else {
            DEPRECATION_LOGGER.deprecate(
                "synonym_tokenfilters",
                "Token filter [" + name() + "] will not be usable to parse synonyms after v7.0"
            );
        }

        return this;
    }
}

@@ -38,7 +38,6 @@ import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;

/**
 * Uses the {@link org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter} to decompound tokens using a dictionary.
 *

@@ -52,7 +51,6 @@ public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWo

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new DictionaryCompoundWordTokenFilter(tokenStream, wordList, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
    }
}

@@ -43,10 +43,9 @@ import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;

public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {

    private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(EdgeNGramTokenFilterFactory.class);

    private final int minGram;

@@ -67,10 +66,13 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
    }

    static int parseSide(String side) {
        switch (side) {
            case "front":
                return SIDE_FRONT;
            case "back":
                return SIDE_BACK;
            default:
                throw new IllegalArgumentException("invalid side: " + side);
        }
    }

@@ -102,10 +104,11 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
    public TokenFilterFactory getSynonymFilter() {
        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
            throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
        } else {
            DEPRECATION_LOGGER.deprecate(
                "synonym_tokenfilters",
                "Token filter [" + name() + "] will not be usable to parse synonyms after v7.0"
            );
            return this;
        }
    }

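Not part of the change itself: a small sketch of the parseSide contract made explicit by the reformatted switch above. It assumes test code in the same package with access to parseSide and the SIDE_* constants, which may not hold in the real class.

    // "front" and "back" map to the two SIDE_* constants; anything else throws.
    assert EdgeNGramTokenFilterFactory.parseSide("front") == SIDE_FRONT;
    assert EdgeNGramTokenFilterFactory.parseSide("back") == SIDE_BACK;
    try {
        EdgeNGramTokenFilterFactory.parseSide("middle"); // unsupported value
        assert false : "expected IllegalArgumentException";
    } catch (IllegalArgumentException expected) {
        // message: "invalid side: middle"
    }
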
@@ -54,5 +54,7 @@ public class EstonianAnalyzerProvider extends AbstractIndexAnalyzerProvider<Esto
    }

    @Override
    public EstonianAnalyzer get() {
        return this.analyzer;
    }
}

@@ -41,7 +41,6 @@ import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.opensearch.index.analysis.Analysis;

/**
 * Builds an OpenRefine Fingerprint analyzer. Uses the default settings from the various components
 * (Standard Tokenizer and lowercase + stop + fingerprint + ascii-folding filters)

@@ -53,7 +52,7 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<A

    public static int DEFAULT_MAX_OUTPUT_SIZE = 255;
    public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;
    public static final char DEFAULT_SEPARATOR = ' ';

    private final FingerprintAnalyzer analyzer;

@@ -61,7 +60,7 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<A
        super(indexSettings, name, settings);

        char separator = parseSeparator(settings);
        int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(), DEFAULT_MAX_OUTPUT_SIZE);
        CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);

        this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize);

@@ -80,7 +79,8 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<A
            return customSeparator.charAt(0);
        }

        throw new IllegalArgumentException(
            "Setting [separator] must be a single, non-null character. [" + customSeparator + "] was provided."
        );
    }
}

@@ -47,7 +47,7 @@ import static org.opensearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUT

public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {

    private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(FingerprintTokenFilterFactory.class);

    private final char separator;
    private final int maxOutputSize;

@@ -69,10 +69,11 @@ public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
    public TokenFilterFactory getSynonymFilter() {
        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
            throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
        } else {
            DEPRECATION_LOGGER.deprecate(
                "synonym_tokenfilters",
                "Token filter [" + name() + "] will not be usable to parse synonyms after v7.0"
            );
            return this;
        }
    }

@@ -45,8 +45,7 @@ public class GreekAnalyzerProvider extends AbstractIndexAnalyzerProvider<GreekAn

    GreekAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        analyzer = new GreekAnalyzer(Analysis.parseStopWords(env, settings, GreekAnalyzer.getDefaultStopSet()));
        analyzer.setVersion(version);
    }

@@ -73,7 +73,14 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new HyphenationCompoundWordTokenFilter(
            tokenStream,
            hyphenationTree,
            wordList,
            minWordSize,
            minSubwordSize,
            maxSubwordSize,
            onlyLongestMatch
        );
    }
}

@@ -64,7 +64,8 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
    static final String KEEP_TYPES_MODE_KEY = "mode";

    enum KeepTypesMode {
        INCLUDE,
        EXCLUDE;

        @Override
        public String toString() {

@@ -78,8 +79,15 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
            } else if (lc.equals("exclude")) {
                return EXCLUDE;
            } else {
                throw new IllegalArgumentException(
                    "`keep_types` tokenfilter mode can only be ["
                        + KeepTypesMode.INCLUDE
                        + "] or ["
                        + KeepTypesMode.EXCLUDE
                        + "] but was ["
                        + modeString
                        + "]."
                );
            }
        }
    }

@@ -80,8 +80,9 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
        final String keepWordsPath = settings.get(KEEP_WORDS_PATH_KEY, null);
        if ((arrayKeepWords == null && keepWordsPath == null) || (arrayKeepWords != null && keepWordsPath != null)) {
            // we don't allow both or none
            throw new IllegalArgumentException(
                "keep requires either `" + KEEP_WORDS_KEY + "` or `" + KEEP_WORDS_PATH_KEY + "` to be configured"
            );
        }
        if (settings.get(ENABLE_POS_INC_KEY) != null) {
            throw new IllegalArgumentException(ENABLE_POS_INC_KEY + " is not supported anymore. Please fix your analysis chain");

@@ -68,15 +68,13 @@ public class KeywordMarkerTokenFilterFactory extends AbstractTokenFilterFactory
    KeywordMarkerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);

        boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
        String patternString = settings.get("keywords_pattern");
        if (patternString != null) {
            // a pattern for matching keywords is specified, as opposed to a
            // set of keyword strings to match against
            if (settings.get("keywords") != null || settings.get("keywords_path") != null) {
                throw new IllegalArgumentException("cannot specify both `keywords_pattern` and `keywords` or `keywords_path`");
            }
            keywordPattern = Pattern.compile(patternString);
            keywordLookup = null;

@@ -84,8 +82,8 @@ public class KeywordMarkerTokenFilterFactory extends AbstractTokenFilterFactory
            Set<?> rules = Analysis.getWordSet(env, settings, "keywords");
            if (rules == null) {
                throw new IllegalArgumentException(
                    "keyword filter requires either `keywords`, `keywords_path`, " + "or `keywords_pattern` to be configured"
                );
            }
            // a set of keywords (or a path to them) is specified
            keywordLookup = new CharArraySet(rules, ignoreCase);

@@ -40,18 +40,20 @@ import org.opensearch.index.IndexSettings;

public class LegacyDelimitedPayloadTokenFilterFactory extends DelimitedPayloadTokenFilterFactory {

    private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(LegacyDelimitedPayloadTokenFilterFactory.class);

    LegacyDelimitedPayloadTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, env, name, settings);
        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
            throw new IllegalArgumentException(
                "[delimited_payload_filter] is not supported for new indices, use [delimited_payload] instead"
            );
        }
        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_6_2_0)) {
            deprecationLogger.deprecate(
                "analysis_legacy_delimited_payload_filter",
                "Deprecated [delimited_payload_filter] used, replaced by [delimited_payload]"
            );
        }
    }
}

@@ -77,5 +77,3 @@ public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory impl
    }

}

@@ -77,12 +77,10 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen
    private void parseRules(List<String> rules, NormalizeCharMap.Builder map) {
        for (String rule : rules) {
            Matcher m = rulePattern.matcher(rule);
            if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
            String lhs = parseString(m.group(1).trim());
            String rhs = parseString(m.group(2).trim());
            if (lhs == null || rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal mapping.");
            map.add(lhs, rhs);
        }
    }

@@ -96,8 +94,7 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen
        while (readPos < len) {
            char c = s.charAt(readPos++);
            if (c == '\\') {
                if (readPos >= len) throw new RuntimeException("Invalid escaped char in [" + s + "]");
                c = s.charAt(readPos++);
                switch (c) {
                    case '\\':

@@ -119,8 +116,7 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen
                        c = '\f';
                        break;
                    case 'u':
                        if (readPos + 3 >= len) throw new RuntimeException("Invalid escaped char in [" + s + "]");
                        c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
                        readPos += 4;
                        break;

@@ -56,7 +56,7 @@ import java.util.function.Function;

public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {

    private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(MultiplexerTokenFilterFactory.class);

    private List<String> filterNames;
    private final boolean preserveOriginal;

@@ -76,22 +76,27 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
    public TokenFilterFactory getSynonymFilter() {
        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
            throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
        } else {
            if (preserveOriginal) {
                DEPRECATION_LOGGER.deprecate(
                    "synonym_tokenfilters",
                    "Token filter [" + name() + "] will not be usable to parse synonyms after v7.0"
                );
                return IDENTITY_FILTER;
            }
            throw new IllegalArgumentException(
                "Token filter [" + name() + "] cannot be used to parse synonyms unless [preserve_original] is [true]"
            );
        }
    }

    @Override
    public TokenFilterFactory getChainAwareTokenFilterFactory(
        TokenizerFactory tokenizer,
        List<CharFilterFactory> charFilters,
        List<TokenFilterFactory> previousTokenFilters,
        Function<String, TokenFilterFactory> allFilters
    ) {
        List<TokenFilterFactory> filters = new ArrayList<>();
        if (preserveOriginal) {
            filters.add(IDENTITY_FILTER);

@@ -139,15 +144,17 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
            public TokenFilterFactory getSynonymFilter() {
                if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
                    throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
                } else {
                    if (preserveOriginal) {
                        DEPRECATION_LOGGER.deprecate(
                            "synonym_tokenfilters",
                            "Token filter [" + name() + "] will not be usable to parse synonyms after v7.0"
                        );
                        return IDENTITY_FILTER;
                    }
                    throw new IllegalArgumentException(
                        "Token filter [" + name() + "] cannot be used to parse synonyms unless [preserve_original] is [true]"
                    );
                }
            }

@@ -42,10 +42,9 @@ import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;

public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {

    private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(NGramTokenFilterFactory.class);

    private final int minGram;
    private final int maxGram;

@@ -62,12 +61,21 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
            if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
                throw new IllegalArgumentException(
                    "The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
                        + maxAllowedNgramDiff
                        + "] but was ["
                        + ngramDiff
                        + "]. This limit can be set by changing the ["
                        + IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey()
                        + "] index level setting."
                );
            } else {
                deprecationLogger.deprecate(
                    "ngram_big_difference",
                    "Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
                        + "expected difference must be less than or equal to: ["
                        + maxAllowedNgramDiff
                        + "]"
                );
            }
        }
        preserveOriginal = settings.getAsBoolean(PRESERVE_ORIG_KEY, false);

@@ -82,10 +90,11 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
    public TokenFilterFactory getSynonymFilter() {
        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
            throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
        } else {
            DEPRECATION_LOGGER.deprecate(
                "synonym_tokenfilters",
                "Token filter [" + name() + "] will not be usable to parse synonyms after v7.0"
            );
            return this;
        }
    }

@@ -71,9 +71,9 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
        // Populate with unicode categories from java.lang.Character
        for (Field field : Character.class.getFields()) {
            if (!field.getName().startsWith("DIRECTIONALITY")
                && Modifier.isPublic(field.getModifiers())
                && Modifier.isStatic(field.getModifiers())
                && field.getType() == byte.class) {
                try {
                    matchers.put(field.getName().toLowerCase(Locale.ROOT), CharMatcher.ByUnicodeCategory.of(field.getByte(null)));
                } catch (Exception e) {

@@ -96,8 +96,14 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
        CharMatcher matcher = MATCHERS.get(characterClass);
        if (matcher == null) {
            if (characterClass.equals("custom") == false) {
                throw new IllegalArgumentException(
                    "Unknown token type: '"
                        + characterClass
                        + "', must be one of "
                        + Stream.of(MATCHERS.keySet(), Collections.singleton("custom"))
                            .flatMap(x -> x.stream())
                            .collect(Collectors.toSet())
                );
            }
            String customCharacters = settings.get("custom_token_chars");
            if (customCharacters == null) {

@@ -128,12 +134,21 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
            if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
                throw new IllegalArgumentException(
                    "The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
                        + maxAllowedNgramDiff
                        + "] but was ["
                        + ngramDiff
                        + "]. This limit can be set by changing the ["
                        + IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey()
                        + "] index level setting."
                );
            } else {
                deprecationLogger.deprecate(
                    "ngram_big_difference",
                    "Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
                        + "expected difference must be less than or equal to: ["
                        + maxAllowedNgramDiff
                        + "]"
                );
            }
        }
        this.matcher = parseTokenChars(settings);

@@ -31,7 +31,6 @@

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.PatternCaptureGroupTokenFilter;

@@ -50,5 +50,3 @@ public class PorterStemTokenFilterFactory extends AbstractTokenFilterFactory {
        return new PorterStemFilter(tokenStream);
    }
}

@@ -58,8 +58,7 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
    private final AnalysisPredicateScript.Factory factory;
    private final List<String> filterNames;

    ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) {
        super(indexSettings, name, settings);

        Settings scriptSettings = settings.getAsSettings("script");

@@ -81,16 +80,20 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
    }

    @Override
    public TokenFilterFactory getChainAwareTokenFilterFactory(
        TokenizerFactory tokenizer,
        List<CharFilterFactory> charFilters,
        List<TokenFilterFactory> previousTokenFilters,
        Function<String, TokenFilterFactory> allFilters
    ) {
        List<TokenFilterFactory> filters = new ArrayList<>();
        List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
        for (String filter : filterNames) {
            TokenFilterFactory tff = allFilters.apply(filter);
            if (tff == null) {
                throw new IllegalArgumentException(
                    "ScriptedConditionTokenFilter [" + name() + "] refers to undefined token filter [" + filter + "]"
                );
            }
            tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
            filters.add(tff);

@@ -121,8 +124,7 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
        private final AnalysisPredicateScript script;
        private final AnalysisPredicateScript.Token token;

        ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory, AnalysisPredicateScript script) {
            super(input, inputFactory);
            this.script = script;
            this.token = new AnalysisPredicateScript.Token(this);

@@ -55,37 +55,33 @@ import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
 */
@Deprecated
public final class SnowballAnalyzer extends Analyzer {
    private String name;
    private CharArraySet stopSet;

    /** Builds the named analyzer with no stop words. */
    SnowballAnalyzer(String name) {
        this.name = name;
    }

    /** Builds the named analyzer with the given stop words. */
    SnowballAnalyzer(String name, CharArraySet stopWords) {
        this(name);
        stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopWords));
    }

    /** Constructs a {@link StandardTokenizer} filtered by a {@link LowerCaseFilter}, a {@link StopFilter},
       and a {@link SnowballFilter} */
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer tokenizer = new StandardTokenizer();
        TokenStream result = tokenizer;
        // remove the possessive 's for english stemmers
        if (name.equals("English") || name.equals("Porter") || name.equals("Lovins")) result = new EnglishPossessiveFilter(result);
        // Use a special lowercase filter for turkish, the stemmer expects it.
        if (name.equals("Turkish")) result = new TurkishLowerCaseFilter(result);
        else result = new LowerCaseFilter(result);
        if (stopSet != null) result = new StopFilter(result, stopSet);
        result = new SnowballFilter(result, name);
        return new TokenStreamComponents(tokenizer, result);
    }
}

@@ -50,6 +50,7 @@ public class StandardHtmlStripAnalyzer extends StopwordAnalyzerBase {
    public StandardHtmlStripAnalyzer() {
        super(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    /**
     * @deprecated in 6.5, can not create in 7.0, and we remove this in 8.0
     */

@@ -43,8 +43,7 @@ import org.opensearch.index.analysis.Analysis;

public class StandardHtmlStripAnalyzerProvider extends AbstractIndexAnalyzerProvider<StandardHtmlStripAnalyzer> {

    private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(StandardHtmlStripAnalyzerProvider.class);

    private final StandardHtmlStripAnalyzer analyzer;

@@ -52,19 +51,23 @@ public class StandardHtmlStripAnalyzerProvider extends AbstractIndexAnalyzerProv
     * @deprecated in 6.5, can not create in 7.0, and we remove this in 8.0
     */
    @Deprecated
    StandardHtmlStripAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        final CharArraySet defaultStopwords = CharArraySet.EMPTY_SET;
        CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
        analyzer = new StandardHtmlStripAnalyzer(stopWords);
        analyzer.setVersion(version);
        if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
            throw new IllegalArgumentException(
                "[standard_html_strip] analyzer is not supported for new indices, "
                    + "use a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter"
            );
        } else {
            DEPRECATION_LOGGER.deprecate(
                "standard_html_strip_deprecation",
                "Deprecated analyzer [standard_html_strip] used, "
                    + "replace it with a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter"
            );
        }
    }

@@ -142,138 +142,139 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
// English stemmers
} else if ("english".equalsIgnoreCase(language)) {
return new PorterStemFilter(tokenStream);
} else if ("light_english".equalsIgnoreCase(language) || "lightEnglish".equalsIgnoreCase(language)
|| "kstem".equalsIgnoreCase(language)) {
return new KStemFilter(tokenStream);
} else if ("lovins".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new LovinsStemmer());
} else if ("porter".equalsIgnoreCase(language)) {
return new PorterStemFilter(tokenStream);
} else if ("porter2".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new EnglishStemmer());
} else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
return new EnglishMinimalStemFilter(tokenStream);
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
return new EnglishPossessiveFilter(tokenStream);
} else if ("light_english".equalsIgnoreCase(language)
|| "lightEnglish".equalsIgnoreCase(language)
|| "kstem".equalsIgnoreCase(language)) {
return new KStemFilter(tokenStream);
} else if ("lovins".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new LovinsStemmer());
} else if ("porter".equalsIgnoreCase(language)) {
return new PorterStemFilter(tokenStream);
} else if ("porter2".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new EnglishStemmer());
} else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
return new EnglishMinimalStemFilter(tokenStream);
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
return new EnglishPossessiveFilter(tokenStream);

} else if ("estonian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new EstonianStemmer());
} else if ("estonian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new EstonianStemmer());

// Finnish stemmers
} else if ("finnish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new FinnishStemmer());
} else if ("light_finish".equalsIgnoreCase(language) || "lightFinish".equalsIgnoreCase(language)) {
// leaving this for backward compatibility
return new FinnishLightStemFilter(tokenStream);
} else if ("light_finnish".equalsIgnoreCase(language) || "lightFinnish".equalsIgnoreCase(language)) {
return new FinnishLightStemFilter(tokenStream);
// Finnish stemmers
} else if ("finnish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new FinnishStemmer());
} else if ("light_finish".equalsIgnoreCase(language) || "lightFinish".equalsIgnoreCase(language)) {
// leaving this for backward compatibility
return new FinnishLightStemFilter(tokenStream);
} else if ("light_finnish".equalsIgnoreCase(language) || "lightFinnish".equalsIgnoreCase(language)) {
return new FinnishLightStemFilter(tokenStream);

// French stemmers
} else if ("french".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new FrenchStemmer());
} else if ("light_french".equalsIgnoreCase(language) || "lightFrench".equalsIgnoreCase(language)) {
return new FrenchLightStemFilter(tokenStream);
} else if ("minimal_french".equalsIgnoreCase(language) || "minimalFrench".equalsIgnoreCase(language)) {
return new FrenchMinimalStemFilter(tokenStream);
// French stemmers
} else if ("french".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new FrenchStemmer());
} else if ("light_french".equalsIgnoreCase(language) || "lightFrench".equalsIgnoreCase(language)) {
return new FrenchLightStemFilter(tokenStream);
} else if ("minimal_french".equalsIgnoreCase(language) || "minimalFrench".equalsIgnoreCase(language)) {
return new FrenchMinimalStemFilter(tokenStream);

// Galician stemmers
} else if ("galician".equalsIgnoreCase(language)) {
return new GalicianStemFilter(tokenStream);
} else if ("minimal_galician".equalsIgnoreCase(language)) {
return new GalicianMinimalStemFilter(tokenStream);
// Galician stemmers
} else if ("galician".equalsIgnoreCase(language)) {
return new GalicianStemFilter(tokenStream);
} else if ("minimal_galician".equalsIgnoreCase(language)) {
return new GalicianMinimalStemFilter(tokenStream);

// German stemmers
} else if ("german".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new GermanStemmer());
} else if ("german2".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new German2Stemmer());
} else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) {
return new GermanLightStemFilter(tokenStream);
} else if ("minimal_german".equalsIgnoreCase(language) || "minimalGerman".equalsIgnoreCase(language)) {
return new GermanMinimalStemFilter(tokenStream);
// German stemmers
} else if ("german".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new GermanStemmer());
} else if ("german2".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new German2Stemmer());
} else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) {
return new GermanLightStemFilter(tokenStream);
} else if ("minimal_german".equalsIgnoreCase(language) || "minimalGerman".equalsIgnoreCase(language)) {
return new GermanMinimalStemFilter(tokenStream);

} else if ("greek".equalsIgnoreCase(language)) {
return new GreekStemFilter(tokenStream);
} else if ("hindi".equalsIgnoreCase(language)) {
return new HindiStemFilter(tokenStream);
} else if ("greek".equalsIgnoreCase(language)) {
return new GreekStemFilter(tokenStream);
} else if ("hindi".equalsIgnoreCase(language)) {
return new HindiStemFilter(tokenStream);

// Hungarian stemmers
} else if ("hungarian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new HungarianStemmer());
} else if ("light_hungarian".equalsIgnoreCase(language) || "lightHungarian".equalsIgnoreCase(language)) {
return new HungarianLightStemFilter(tokenStream);
// Hungarian stemmers
} else if ("hungarian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new HungarianStemmer());
} else if ("light_hungarian".equalsIgnoreCase(language) || "lightHungarian".equalsIgnoreCase(language)) {
return new HungarianLightStemFilter(tokenStream);

} else if ("indonesian".equalsIgnoreCase(language)) {
return new IndonesianStemFilter(tokenStream);
} else if ("indonesian".equalsIgnoreCase(language)) {
return new IndonesianStemFilter(tokenStream);

// Irish stemmer
} else if ("irish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new IrishStemmer());
// Irish stemmer
} else if ("irish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new IrishStemmer());

// Italian stemmers
} else if ("italian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new ItalianStemmer());
} else if ("light_italian".equalsIgnoreCase(language) || "lightItalian".equalsIgnoreCase(language)) {
return new ItalianLightStemFilter(tokenStream);
// Italian stemmers
} else if ("italian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new ItalianStemmer());
} else if ("light_italian".equalsIgnoreCase(language) || "lightItalian".equalsIgnoreCase(language)) {
return new ItalianLightStemFilter(tokenStream);

} else if ("latvian".equalsIgnoreCase(language)) {
return new LatvianStemFilter(tokenStream);
} else if ("latvian".equalsIgnoreCase(language)) {
return new LatvianStemFilter(tokenStream);

} else if ("lithuanian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new LithuanianStemmer());
} else if ("lithuanian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new LithuanianStemmer());

// Norwegian (Bokmål) stemmers
} else if ("norwegian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new NorwegianStemmer());
} else if ("light_norwegian".equalsIgnoreCase(language) || "lightNorwegian".equalsIgnoreCase(language)) {
return new NorwegianLightStemFilter(tokenStream);
} else if ("minimal_norwegian".equalsIgnoreCase(language) || "minimalNorwegian".equals(language)) {
return new NorwegianMinimalStemFilter(tokenStream);
// Norwegian (Bokmål) stemmers
} else if ("norwegian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new NorwegianStemmer());
} else if ("light_norwegian".equalsIgnoreCase(language) || "lightNorwegian".equalsIgnoreCase(language)) {
return new NorwegianLightStemFilter(tokenStream);
} else if ("minimal_norwegian".equalsIgnoreCase(language) || "minimalNorwegian".equals(language)) {
return new NorwegianMinimalStemFilter(tokenStream);

// Norwegian (Nynorsk) stemmers
} else if ("light_nynorsk".equalsIgnoreCase(language) || "lightNynorsk".equalsIgnoreCase(language)) {
return new NorwegianLightStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);
} else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) {
return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);
// Norwegian (Nynorsk) stemmers
} else if ("light_nynorsk".equalsIgnoreCase(language) || "lightNynorsk".equalsIgnoreCase(language)) {
return new NorwegianLightStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);
} else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) {
return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);

// Portuguese stemmers
} else if ("portuguese".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new PortugueseStemmer());
} else if ("light_portuguese".equalsIgnoreCase(language) || "lightPortuguese".equalsIgnoreCase(language)) {
return new PortugueseLightStemFilter(tokenStream);
} else if ("minimal_portuguese".equalsIgnoreCase(language) || "minimalPortuguese".equalsIgnoreCase(language)) {
return new PortugueseMinimalStemFilter(tokenStream);
} else if ("portuguese_rslp".equalsIgnoreCase(language)) {
return new PortugueseStemFilter(tokenStream);
// Portuguese stemmers
} else if ("portuguese".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new PortugueseStemmer());
} else if ("light_portuguese".equalsIgnoreCase(language) || "lightPortuguese".equalsIgnoreCase(language)) {
return new PortugueseLightStemFilter(tokenStream);
} else if ("minimal_portuguese".equalsIgnoreCase(language) || "minimalPortuguese".equalsIgnoreCase(language)) {
return new PortugueseMinimalStemFilter(tokenStream);
} else if ("portuguese_rslp".equalsIgnoreCase(language)) {
return new PortugueseStemFilter(tokenStream);

} else if ("romanian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new RomanianStemmer());
} else if ("romanian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new RomanianStemmer());

// Russian stemmers
} else if ("russian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new RussianStemmer());
} else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) {
return new RussianLightStemFilter(tokenStream);
// Russian stemmers
} else if ("russian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new RussianStemmer());
} else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) {
return new RussianLightStemFilter(tokenStream);

// Spanish stemmers
} else if ("spanish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new SpanishStemmer());
} else if ("light_spanish".equalsIgnoreCase(language) || "lightSpanish".equalsIgnoreCase(language)) {
return new SpanishLightStemFilter(tokenStream);
// Spanish stemmers
} else if ("spanish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new SpanishStemmer());
} else if ("light_spanish".equalsIgnoreCase(language) || "lightSpanish".equalsIgnoreCase(language)) {
return new SpanishLightStemFilter(tokenStream);

// Sorani Kurdish stemmer
} else if ("sorani".equalsIgnoreCase(language)) {
return new SoraniStemFilter(tokenStream);
// Sorani Kurdish stemmer
} else if ("sorani".equalsIgnoreCase(language)) {
return new SoraniStemFilter(tokenStream);

// Swedish stemmers
} else if ("swedish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new SwedishStemmer());
} else if ("light_swedish".equalsIgnoreCase(language) || "lightSwedish".equalsIgnoreCase(language)) {
return new SwedishLightStemFilter(tokenStream);
// Swedish stemmers
} else if ("swedish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new SwedishStemmer());
} else if ("light_swedish".equalsIgnoreCase(language) || "lightSwedish".equalsIgnoreCase(language)) {
return new SwedishLightStemFilter(tokenStream);

} else if ("turkish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new TurkishStemmer());
}
} else if ("turkish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new TurkishStemmer());
}

return new SnowballFilter(tokenStream, language);
}
@@ -49,8 +49,7 @@ import java.util.function.Function;

public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {

SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
String name, Settings settings) {
SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);
}

@@ -60,9 +59,12 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
}

@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
public TokenFilterFactory getChainAwareTokenFilterFactory(
TokenizerFactory tokenizer,
List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters
) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
@@ -64,15 +64,16 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
protected final Environment environment;
protected final AnalysisMode analysisMode;

SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env,
String name, Settings settings) {
SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
this.settings = settings;

if (settings.get("ignore_case") != null) {
DEPRECATION_LOGGER.deprecate("synonym_ignore_case_option",
"The ignore_case option on the synonym_graph filter is deprecated. " +
"Instead, insert a lowercase filter in the filter chain before the synonym_graph filter.");
DEPRECATION_LOGGER.deprecate(
"synonym_ignore_case_option",
"The ignore_case option on the synonym_graph filter is deprecated. "
+ "Instead, insert a lowercase filter in the filter chain before the synonym_graph filter."
);
}

this.expand = settings.getAsBoolean("expand", true);

@@ -94,9 +95,12 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
}

@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
public TokenFilterFactory getChainAwareTokenFilterFactory(
TokenizerFactory tokenizer,
List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters
) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
@@ -126,12 +130,17 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
};
}

Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> tokenFilters, Function<String, TokenFilterFactory> allFilters) {
return new CustomAnalyzer(tokenizer, charFilters.toArray(new CharFilterFactory[0]),
tokenFilters.stream()
.map(TokenFilterFactory::getSynonymFilter)
.toArray(TokenFilterFactory[]::new));
Analyzer buildSynonymAnalyzer(
TokenizerFactory tokenizer,
List<CharFilterFactory> charFilters,
List<TokenFilterFactory> tokenFilters,
Function<String, TokenFilterFactory> allFilters
) {
return new CustomAnalyzer(
tokenizer,
charFilters.toArray(new CharFilterFactory[0]),
tokenFilters.stream().map(TokenFilterFactory::getSynonymFilter).toArray(TokenFilterFactory[]::new)
);
}

SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
@@ -45,8 +45,7 @@ public class ThaiAnalyzerProvider extends AbstractIndexAnalyzerProvider<ThaiAnal

ThaiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new ThaiAnalyzer(
Analysis.parseStopWords(env, settings, ThaiAnalyzer.getDefaultStopSet()));
analyzer = new ThaiAnalyzer(Analysis.parseStopWords(env, settings, ThaiAnalyzer.getDefaultStopSet()));
analyzer.setVersion(version);
}

@@ -47,7 +47,7 @@ public class TrimTokenFilterFactory extends AbstractTokenFilterFactory implement
TrimTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
if (settings.get(UPDATE_OFFSETS_KEY) != null) {
throw new IllegalArgumentException(UPDATE_OFFSETS_KEY + " is not supported anymore. Please fix your analysis chain");
throw new IllegalArgumentException(UPDATE_OFFSETS_KEY + " is not supported anymore. Please fix your analysis chain");
}
}
@@ -97,5 +97,3 @@ class UniqueTokenFilter extends TokenFilter {
previous.clear();
}
}

@@ -52,5 +52,3 @@ public class UpperCaseTokenFilterFactory extends AbstractTokenFilterFactory impl
}

}
@@ -62,16 +62,14 @@ import static org.opensearch.analysis.common.WordDelimiterTokenFilterFactory.par

public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {

private static final DeprecationLogger DEPRECATION_LOGGER =
DeprecationLogger.getLogger(WordDelimiterGraphTokenFilterFactory.class);
private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(WordDelimiterGraphTokenFilterFactory.class);

private final byte[] charTypeTable;
private final int flags;
private final CharArraySet protoWords;
private final boolean adjustOffsets;

public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
String name, Settings settings) {
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);

// Sample Format for the type table:

@@ -123,10 +121,11 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
public TokenFilterFactory getSynonymFilter() {
if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
}
else {
DEPRECATION_LOGGER.deprecate("synonym_tokenfilters", "Token filter [" + name()
+ "] will not be usable to parse synonyms after v7.0");
} else {
DEPRECATION_LOGGER.deprecate(
"synonym_tokenfilters",
"Token filter [" + name() + "] will not be usable to parse synonyms after v7.0"
);
return this;
}
}
@@ -66,15 +66,13 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_

public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory {

private static final DeprecationLogger DEPRECATION_LOGGER =
DeprecationLogger.getLogger(WordDelimiterTokenFilterFactory.class);
private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(WordDelimiterTokenFilterFactory.class);

private final byte[] charTypeTable;
private final int flags;
private final CharArraySet protoWords;

public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env,
String name, Settings settings) {
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);

// Sample Format for the type table:

@@ -116,20 +114,18 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory

@Override
public TokenStream create(TokenStream tokenStream) {
return new WordDelimiterFilter(tokenStream,
charTypeTable,
flags,
protoWords);
return new WordDelimiterFilter(tokenStream, charTypeTable, flags, protoWords);
}

@Override
public TokenFilterFactory getSynonymFilter() {
if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_0_0)) {
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
}
else {
DEPRECATION_LOGGER.deprecate("synonym_tokenfilters", "Token filter [" + name()
+ "] will not be usable to parse synonyms after v7.0");
} else {
DEPRECATION_LOGGER.deprecate(
"synonym_tokenfilters",
"Token filter [" + name() + "] will not be usable to parse synonyms after v7.0"
);
return this;
}
}
@@ -151,21 +147,16 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
SortedMap<Character, Byte> typeMap = new TreeMap<>();
for (String rule : rules) {
Matcher m = typePattern.matcher(rule);
if (!m.find())
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
String lhs = parseString(m.group(1).trim());
Byte rhs = parseType(m.group(2).trim());
if (lhs.length() != 1)
throw new RuntimeException("Invalid Mapping Rule : ["
+ rule + "]. Only a single character is allowed.");
if (rhs == null)
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
if (lhs.length() != 1) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
typeMap.put(lhs.charAt(0), rhs);
}

// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
byte types[] = new byte[Math.max(
typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
for (int i = 0; i < types.length; i++)
types[i] = WordDelimiterIterator.getType(i);
for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())

@@ -174,20 +165,13 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
}

private static Byte parseType(String s) {
if (s.equals("LOWER"))
return WordDelimiterFilter.LOWER;
else if (s.equals("UPPER"))
return WordDelimiterFilter.UPPER;
else if (s.equals("ALPHA"))
return WordDelimiterFilter.ALPHA;
else if (s.equals("DIGIT"))
return WordDelimiterFilter.DIGIT;
else if (s.equals("ALPHANUM"))
return WordDelimiterFilter.ALPHANUM;
else if (s.equals("SUBWORD_DELIM"))
return WordDelimiterFilter.SUBWORD_DELIM;
else
return null;
if (s.equals("LOWER")) return WordDelimiterFilter.LOWER;
else if (s.equals("UPPER")) return WordDelimiterFilter.UPPER;
else if (s.equals("ALPHA")) return WordDelimiterFilter.ALPHA;
else if (s.equals("DIGIT")) return WordDelimiterFilter.DIGIT;
else if (s.equals("ALPHANUM")) return WordDelimiterFilter.ALPHANUM;
else if (s.equals("SUBWORD_DELIM")) return WordDelimiterFilter.SUBWORD_DELIM;
else return null;
}

private static String parseString(String s) {
@@ -198,8 +182,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
while (readPos < len) {
char c = s.charAt(readPos++);
if (c == '\\') {
if (readPos >= len)
throw new RuntimeException("Invalid escaped char in [" + s + "]");
if (readPos >= len) throw new RuntimeException("Invalid escaped char in [" + s + "]");
c = s.charAt(readPos++);
switch (c) {
case '\\':

@@ -221,8 +204,7 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
c = '\f';
break;
case 'u':
if (readPos + 3 >= len)
throw new RuntimeException("Invalid escaped char in [" + s + "]");
if (readPos + 3 >= len) throw new RuntimeException("Invalid escaped char in [" + s + "]");
c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
readPos += 4;
break;
@@ -85,8 +85,8 @@ class XLowerCaseTokenizer extends Tokenizer {
assert start == -1;
start = offset + bufferIndex - charCount;
end = start;
} else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
} else if (length >= buffer.length - 1) { // check if a supplementary could run out of bounds
buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer
}
end += charCount;
length += Character.toChars(Character.toLowerCase(c), buffer, length); // buffer it, normalized
@ -47,14 +47,15 @@ import java.io.StringReader;
|
|||
public class ASCIIFoldingTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
||||
public void testDefault() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
|
||||
String source = "Ansprüche";
|
||||
String[] expected = new String[]{"Anspruche"};
|
||||
String[] expected = new String[] { "Anspruche" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -62,15 +63,16 @@ public class ASCIIFoldingTokenFilterFactoryTests extends OpenSearchTokenStreamTe
|
|||
|
||||
public void testPreserveOriginal() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
|
||||
.put("index.analysis.filter.my_ascii_folding.preserve_original", true)
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
|
||||
.put("index.analysis.filter.my_ascii_folding.preserve_original", true)
|
||||
.build(),
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
|
||||
String source = "Ansprüche";
|
||||
String[] expected = new String[]{"Anspruche", "Ansprüche"};
|
||||
String[] expected = new String[] { "Anspruche", "Ansprüche" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -78,7 +80,7 @@ public class ASCIIFoldingTokenFilterFactoryTests extends OpenSearchTokenStreamTe
|
|||
// but the multi-term aware component still emits a single token
|
||||
tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
expected = new String[]{"Anspruche"};
|
||||
expected = new String[] { "Anspruche" };
|
||||
assertTokenStreamContents(tokenFilter.normalize(tokenizer), expected);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,7 +31,6 @@
|
|||
|
||||
package org.opensearch.analysis.common;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.opensearch.common.settings.Settings;
|
||||
|
@ -57,15 +56,15 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends OpenSe
|
|||
|
||||
public void testDefault() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.build(),
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi",
|
||||
"fi", "4000", "j", "2", "se", "O", "Neil"};
|
||||
String[] expected = new String[] { "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -73,17 +72,17 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends OpenSe
|
|||
|
||||
public void testCatenateWords() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[] { "PowerShot", "500", "42", "wifi", "wifi", "4000", "j",
|
||||
"2", "se", "ONeil" };
|
||||
String[] expected = new String[] { "PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -91,17 +90,17 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends OpenSe
|
|||
|
||||
public void testCatenateNumbers() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[] { "Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000",
|
||||
"j", "2", "se", "O", "Neil" };
|
||||
String[] expected = new String[] { "Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -109,17 +108,18 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends OpenSe
|
|||
|
||||
public void testCatenateAll() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
|
||||
String[] expected = new String[] { "PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -127,15 +127,16 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends OpenSe
|
|||
|
||||
public void testSplitOnCaseChange() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot";
|
||||
String[] expected = new String[]{"PowerShot"};
|
||||
String[] expected = new String[] { "PowerShot" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -143,17 +144,36 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends OpenSe
|
|||
|
||||
public void testPreserveOriginal() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[] { "PowerShot", "Power", "Shot", "500-42", "500", "42",
|
||||
"wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se",
|
||||
"O'Neil's", "O", "Neil" };
|
||||
String[] expected = new String[] {
|
||||
"PowerShot",
|
||||
"Power",
|
||||
"Shot",
|
||||
"500-42",
|
||||
"500",
|
||||
"42",
|
||||
"wi-fi",
|
||||
"wi",
|
||||
"fi",
|
||||
"wi-fi-4000",
|
||||
"wi",
|
||||
"fi",
|
||||
"4000",
|
||||
"j2se",
|
||||
"j",
|
||||
"2",
|
||||
"se",
|
||||
"O'Neil's",
|
||||
"O",
|
||||
"Neil" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -161,16 +181,16 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends OpenSe
|
|||
|
||||
public void testStemEnglishPossessive() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[] { "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi",
|
||||
"4000", "j", "2", "se", "O", "Neil", "s" };
|
||||
String[] expected = new String[] { "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
|
|
@ -58,7 +58,7 @@ public class CJKFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
public void testDefault() throws IOException {
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_bigram");
|
||||
String source = "多くの学生が試験に落ちた。";
|
||||
String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
|
||||
String[] expected = new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -67,7 +67,7 @@ public class CJKFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
public void testNoFlags() throws IOException {
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
|
||||
String source = "多くの学生が試験に落ちた。";
|
||||
String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
|
||||
String[] expected = new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -76,7 +76,7 @@ public class CJKFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
public void testHanOnly() throws IOException {
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_only");
|
||||
String source = "多くの学生が試験に落ちた。";
|
||||
String[] expected = new String[]{"多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" };
|
||||
String[] expected = new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" };
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -85,7 +85,7 @@ public class CJKFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
public void testHanUnigramOnly() throws IOException {
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
|
||||
String source = "多くの学生が試験に落ちた。";
|
||||
String[] expected = new String[]{"多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" };
|
||||
String[] expected = new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" };
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
|
|
@ -54,23 +54,25 @@ public class CharGroupTokenizerFactoryTests extends OpenSearchTokenStreamTestCas
|
|||
IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
|
||||
final String name = "cg";
|
||||
for (String[] conf : Arrays.asList(
|
||||
new String[] { "\\v" },
|
||||
new String[] { "\\u00245" },
|
||||
new String[] { "commas" },
|
||||
new String[] { "a", "b", "c", "\\$" })) {
|
||||
new String[] { "\\v" },
|
||||
new String[] { "\\u00245" },
|
||||
new String[] { "commas" },
|
||||
new String[] { "a", "b", "c", "\\$" }
|
||||
)) {
|
||||
final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
|
||||
expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).create());
|
||||
}
|
||||
|
||||
for (String[] conf : Arrays.asList(
|
||||
new String[0],
|
||||
new String[] { "\\n" },
|
||||
new String[] { "\\u0024" },
|
||||
new String[] { "whitespace" },
|
||||
new String[] { "a", "b", "c" },
|
||||
new String[] { "a", "b", "c", "\\r" },
|
||||
new String[] { "\\r" },
|
||||
new String[] { "f", "o", "o", "symbol" })) {
|
||||
new String[0],
|
||||
new String[] { "\\n" },
|
||||
new String[] { "\\u0024" },
|
||||
new String[] { "whitespace" },
|
||||
new String[] { "a", "b", "c" },
|
||||
new String[] { "a", "b", "c", "\\r" },
|
||||
new String[] { "\\r" },
|
||||
new String[] { "f", "o", "o", "symbol" }
|
||||
)) {
|
||||
final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(conf)).build();
|
||||
new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
|
||||
// no exception
|
||||
|
@ -83,22 +85,21 @@ public class CharGroupTokenizerFactoryTests extends OpenSearchTokenStreamTestCas
|
|||
IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
|
||||
final String name = "cg";
|
||||
|
||||
String[] conf = new String[] {"-"};
|
||||
String[] conf = new String[] { "-" };
|
||||
|
||||
final Settings defaultLengthSettings = newAnalysisSettingsBuilder()
|
||||
.putList("tokenize_on_chars", conf)
|
||||
.build();
|
||||
final Settings defaultLengthSettings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
|
||||
CharTokenizer tokenizer = (CharTokenizer) new CharGroupTokenizerFactory(indexProperties, null, name, defaultLengthSettings)
|
||||
.create();
|
||||
String textWithVeryLongToken = RandomStrings.randomAsciiAlphanumOfLength(random(), 256).concat("-trailing");
|
||||
try (Reader reader = new StringReader(textWithVeryLongToken)) {
|
||||
tokenizer.setReader(reader);
|
||||
assertTokenStreamContents(tokenizer, new String[] { textWithVeryLongToken.substring(0, 255),
|
||||
textWithVeryLongToken.substring(255, 256), "trailing"});
|
||||
assertTokenStreamContents(
|
||||
tokenizer,
|
||||
new String[] { textWithVeryLongToken.substring(0, 255), textWithVeryLongToken.substring(255, 256), "trailing" }
|
||||
);
|
||||
}
|
||||
|
||||
final Settings analysisSettings = newAnalysisSettingsBuilder()
|
||||
.putList("tokenize_on_chars", conf)
|
||||
final Settings analysisSettings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf)
|
||||
.put("max_token_length", 2)
|
||||
.build();
|
||||
tokenizer = (CharTokenizer) new CharGroupTokenizerFactory(indexProperties, null, name, analysisSettings).create();
|
||||
|
@ -107,20 +108,22 @@ public class CharGroupTokenizerFactoryTests extends OpenSearchTokenStreamTestCas
|
|||
assertTokenStreamContents(tokenizer, new String[] { "on", "e", "tw", "o", "th", "re", "e" });
|
||||
}
|
||||
|
||||
final Settings tooLongLengthSettings = newAnalysisSettingsBuilder()
|
||||
.putList("tokenize_on_chars", conf)
|
||||
final Settings tooLongLengthSettings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf)
|
||||
.put("max_token_length", 1024 * 1024 + 1)
|
||||
.build();
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
() -> new CharGroupTokenizerFactory(indexProperties, null, name, tooLongLengthSettings).create());
|
||||
IllegalArgumentException e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> new CharGroupTokenizerFactory(indexProperties, null, name, tooLongLengthSettings).create()
|
||||
);
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 1048577", e.getMessage());
|
||||
|
||||
final Settings negativeLengthSettings = newAnalysisSettingsBuilder()
|
||||
.putList("tokenize_on_chars", conf)
|
||||
final Settings negativeLengthSettings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf)
|
||||
.put("max_token_length", -1)
|
||||
.build();
|
||||
e = expectThrows(IllegalArgumentException.class,
|
||||
() -> new CharGroupTokenizerFactory(indexProperties, null, name, negativeLengthSettings).create());
|
||||
e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> new CharGroupTokenizerFactory(indexProperties, null, name, negativeLengthSettings).create()
|
||||
);
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
|
||||
}
|
||||
|
||||
|
@ -129,9 +132,13 @@ public class CharGroupTokenizerFactoryTests extends OpenSearchTokenStreamTestCas
|
|||
final String name = "cg";
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", "whitespace", ":", "\\u0024").build();
|
||||
Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null, name, settings).create();
|
||||
Tokenizer tokenizer = new CharGroupTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
name,
|
||||
settings
|
||||
).create();
|
||||
tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "foo", "bar", "34", "test", "test2" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -161,13 +161,13 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
|||
@Override
|
||||
protected Map<String, Class<?>> getCharFilters() {
|
||||
Map<String, Class<?>> filters = new TreeMap<>(super.getCharFilters());
|
||||
filters.put("htmlstrip", HtmlStripCharFilterFactory.class);
|
||||
filters.put("mapping", MappingCharFilterFactory.class);
|
||||
filters.put("htmlstrip", HtmlStripCharFilterFactory.class);
|
||||
filters.put("mapping", MappingCharFilterFactory.class);
|
||||
filters.put("patternreplace", PatternReplaceCharFilterFactory.class);
|
||||
|
||||
// TODO: these charfilters are not yet exposed: useful?
|
||||
// handling of zwnj for persian
|
||||
filters.put("persian", Void.class);
|
||||
filters.put("persian", Void.class);
|
||||
return filters;
|
||||
}
|
||||
|
||||
|
@ -280,12 +280,16 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
|||
}
|
||||
|
||||
private void markedTestCase(String name, Map<String, Class<?>> map) {
|
||||
List<String> unmarked = map.entrySet().stream()
|
||||
.filter(e -> e.getValue() == MovedToAnalysisCommon.class)
|
||||
.map(Map.Entry::getKey)
|
||||
.sorted()
|
||||
.collect(toList());
|
||||
assertEquals(name + " marked in AnalysisFactoryTestCase as moved to analysis-common "
|
||||
+ "but not mapped here", emptyList(), unmarked);
|
||||
List<String> unmarked = map.entrySet()
|
||||
.stream()
|
||||
.filter(e -> e.getValue() == MovedToAnalysisCommon.class)
|
||||
.map(Map.Entry::getKey)
|
||||
.sorted()
|
||||
.collect(toList());
|
||||
assertEquals(
|
||||
name + " marked in AnalysisFactoryTestCase as moved to analysis-common " + "but not mapped here",
|
||||
emptyList(),
|
||||
unmarked
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -56,10 +56,16 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
* Check that the deprecated name "nGram" issues a deprecation warning for indices created since 6.0.0
|
||||
*/
|
||||
public void testNGramDeprecationWarning() throws IOException {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(
|
||||
random(), LegacyESVersion.V_6_0_0, VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)))
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(
|
||||
random(),
|
||||
LegacyESVersion.V_6_0_0,
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)
|
||||
)
|
||||
)
|
||||
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.custom_analyzer.filter", "nGram")
|
||||
|
@ -69,15 +75,18 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
|
||||
}
|
||||
|
||||
assertWarnings("The [nGram] token filter name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the filter name to [ngram] instead.");
|
||||
assertWarnings(
|
||||
"The [nGram] token filter name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the filter name to [ngram] instead."
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that the deprecated name "nGram" throws an error since 7.0.0
|
||||
*/
|
||||
public void testNGramDeprecationError() throws IOException {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, null))
|
||||
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
|
||||
|
@ -85,10 +94,15 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
.build();
|
||||
|
||||
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
|
||||
assertEquals("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
|
||||
+ "Please change the filter name to [ngram] instead.", e.getMessage());
|
||||
IllegalArgumentException e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin)
|
||||
);
|
||||
assertEquals(
|
||||
"The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
|
||||
+ "Please change the filter name to [ngram] instead.",
|
||||
e.getMessage()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -96,10 +110,16 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
* Check that the deprecated name "edgeNGram" issues a deprecation warning for indices created since 6.0.0
|
||||
*/
|
||||
public void testEdgeNGramDeprecationWarning() throws IOException {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(
|
||||
random(), LegacyESVersion.V_6_4_0, VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)))
|
||||
random(),
|
||||
LegacyESVersion.V_6_4_0,
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)
|
||||
)
|
||||
)
|
||||
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.custom_analyzer.filter", "edgeNGram")
|
||||
|
@ -108,15 +128,18 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
|
||||
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
|
||||
}
|
||||
assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the filter name to [edge_ngram] instead.");
|
||||
assertWarnings(
|
||||
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the filter name to [edge_ngram] instead."
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that the deprecated name "edgeNGram" throws an error for indices created since 7.0.0
|
||||
*/
|
||||
public void testEdgeNGramDeprecationError() throws IOException {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, null))
|
||||
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
|
||||
|
@ -124,10 +147,15 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
.build();
|
||||
|
||||
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
|
||||
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
|
||||
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
|
||||
assertEquals("The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
|
||||
+ "Please change the filter name to [edge_ngram] instead.", ex.getMessage());
|
||||
IllegalArgumentException ex = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin)
|
||||
);
|
||||
assertEquals(
|
||||
"The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
|
||||
+ "Please change the filter name to [edge_ngram] instead.",
|
||||
ex.getMessage()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -135,29 +163,43 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
* Check that the deprecated analyzer name "standard_html_strip" throws exception for indices created since 7.0.0
|
||||
*/
|
||||
public void testStandardHtmlStripAnalyzerDeprecationError() throws IOException {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT))
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT)
|
||||
)
|
||||
.put("index.analysis.analyzer.custom_analyzer.type", "standard_html_strip")
|
||||
.putList("index.analysis.analyzer.custom_analyzer.stopwords", "a", "b")
|
||||
.build();
|
||||
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin();
|
||||
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
|
||||
() -> createTestAnalysis(idxSettings, settings, commonAnalysisPlugin));
|
||||
assertEquals("[standard_html_strip] analyzer is not supported for new indices, " +
|
||||
"use a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter", ex.getMessage());
|
||||
IllegalArgumentException ex = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> createTestAnalysis(idxSettings, settings, commonAnalysisPlugin)
|
||||
);
|
||||
assertEquals(
|
||||
"[standard_html_strip] analyzer is not supported for new indices, "
|
||||
+ "use a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter",
|
||||
ex.getMessage()
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that the deprecated analyzer name "standard_html_strip" issues a deprecation warning for indices created since 6.5.0 until 7
|
||||
*/
|
||||
public void testStandardHtmlStripAnalyzerDeprecationWarning() throws IOException {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_6_0_0,
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)))
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(
|
||||
random(),
|
||||
LegacyESVersion.V_6_0_0,
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)
|
||||
)
|
||||
)
|
||||
.put("index.analysis.analyzer.custom_analyzer.type", "standard_html_strip")
|
||||
.putList("index.analysis.analyzer.custom_analyzer.stopwords", "a", "b")
|
||||
.build();
|
||||
|
@ -168,8 +210,9 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
Analyzer analyzer = analyzers.get("custom_analyzer");
|
||||
assertNotNull(((NamedAnalyzer) analyzer).analyzer());
|
||||
assertWarnings(
|
||||
"Deprecated analyzer [standard_html_strip] used, " +
|
||||
"replace it with a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter");
|
||||
"Deprecated analyzer [standard_html_strip] used, "
|
||||
+ "replace it with a custom analyzer using [standard] tokenizer and [html_strip] char_filter, plus [lowercase] filter"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -177,9 +220,12 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
* Check that the deprecated "nGram" filter logs a warning when the filter is used as a custom filter
|
||||
*/
|
||||
public void testnGramFilterInCustomAnalyzerDeprecationError() throws IOException {
|
||||
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT))
|
||||
final Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT)
|
||||
)
|
||||
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.custom_analyzer.filter", "my_ngram")
|
||||
|
@ -189,17 +235,22 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
final CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin();
|
||||
|
||||
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
|
||||
assertWarnings("The [nGram] token filter name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the filter name to [ngram] instead.");
|
||||
assertWarnings(
|
||||
"The [nGram] token filter name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the filter name to [ngram] instead."
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that the deprecated "edgeNGram" filter logs a warning when the filter is used as a custom filter
|
||||
*/
|
||||
public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOException {
|
||||
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT))
|
||||
final Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT)
|
||||
)
|
||||
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.custom_analyzer.filter", "my_ngram")
|
||||
|
@ -208,8 +259,10 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
final CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin();
|
||||
|
||||
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
|
||||
assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the filter name to [edge_ngram] instead.");
|
||||
assertWarnings(
|
||||
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the filter name to [edge_ngram] instead."
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -218,56 +271,90 @@ public class CommonAnalysisPluginTests extends OpenSearchTestCase {
|
|||
*/
|
||||
public void testNGramTokenizerDeprecation() throws IOException {
|
||||
// tests for prebuilt tokenizer
|
||||
doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2), false);
|
||||
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2), false);
|
||||
doTestPrebuiltTokenizerDeprecation(
|
||||
"nGram",
|
||||
"ngram",
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2),
|
||||
false
|
||||
);
|
||||
doTestPrebuiltTokenizerDeprecation(
|
||||
"edgeNGram",
|
||||
"edge_ngram",
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2),
|
||||
false
|
||||
);
|
||||
doTestPrebuiltTokenizerDeprecation("nGram", "ngram", LegacyESVersion.V_7_6_0, true);
|
||||
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram", LegacyESVersion.V_7_6_0, true);
|
||||
|
||||
// same batch of tests for custom tokenizer definition in the settings
|
||||
doTestCustomTokenizerDeprecation("nGram", "ngram",
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2), false);
|
||||
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2), false);
|
||||
doTestCustomTokenizerDeprecation(
|
||||
"nGram",
|
||||
"ngram",
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2),
|
||||
false
|
||||
);
|
||||
doTestCustomTokenizerDeprecation(
|
||||
"edgeNGram",
|
||||
"edge_ngram",
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2),
|
||||
false
|
||||
);
|
||||
doTestCustomTokenizerDeprecation("nGram", "ngram", LegacyESVersion.V_7_6_0, true);
|
||||
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram", LegacyESVersion.V_7_6_0, true);
|
||||
}
|
||||
|
||||
public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
|
||||
throws IOException {
|
||||
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, version).build();
|
||||
throws IOException {
|
||||
final Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, version)
|
||||
.build();
|
||||
|
||||
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
|
||||
Map<String, TokenizerFactory> tokenizers = createTestAnalysis(
|
||||
IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer;
|
||||
IndexSettingsModule.newIndexSettings("index", settings),
|
||||
settings,
|
||||
commonAnalysisPlugin
|
||||
).tokenizer;
|
||||
TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName);
|
||||
|
||||
Tokenizer tokenizer = tokenizerFactory.create();
|
||||
assertNotNull(tokenizer);
|
||||
if (expectWarning) {
|
||||
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the tokenizer name to [" + replacement + "] instead.");
|
||||
assertWarnings(
|
||||
"The ["
|
||||
+ deprecatedName
|
||||
+ "] tokenizer name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the tokenizer name to ["
|
||||
+ replacement
|
||||
+ "] instead."
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void doTestCustomTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
|
||||
throws IOException {
|
||||
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
throws IOException {
|
||||
final Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, version)
|
||||
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "my_tokenizer")
|
||||
.put("index.analysis.tokenizer.my_tokenizer.type", deprecatedName)
|
||||
.build();
|
||||
.build();
|
||||
|
||||
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
|
||||
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
|
||||
|
||||
if (expectWarning) {
|
||||
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the tokenizer name to [" + replacement + "] instead.");
|
||||
assertWarnings(
|
||||
"The ["
|
||||
+ deprecatedName
|
||||
+ "] tokenizer name is deprecated and will be removed in a future version. "
|
||||
+ "Please change the tokenizer name to ["
|
||||
+ replacement
|
||||
+ "] instead."
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
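The hunks above are typical of what this commit's spotless pass does across the module's tests: long Settings.builder chains get one call per line, wrapped arguments move onto their own lines, and concatenated message strings are split at the "+". A minimal sketch of the resulting shape outside any test class follows; the class name, the placeholder home path, and the analyzer settings are illustrative only, and the literal path stands in for the createTempDir() helper that exists only in the test base class.

import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;

public class SpotlessStyleSketch {
    public static void main(String[] args) {
        // Post-spotless layout: each chained .put()/.putList() sits on its own line.
        Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), "/tmp/opensearch-home") // placeholder for createTempDir()
            .put("index.analysis.analyzer.custom_analyzer.type", "custom")
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
            .putList("index.analysis.analyzer.custom_analyzer.filter", "lowercase")
            .build();
        System.out.println(settings);
    }
}

The same layout repeats in every file touched below, which is why the hunks are large but behavior-neutral.
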
@ -53,25 +53,25 @@ import java.nio.file.Path;
|
|||
public class CommonGramsTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
||||
public void testDefault() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_default.type", "common_grams")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
.put("index.analysis.filter.common_grams_default.type", "common_grams")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
|
||||
try {
|
||||
AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
Assert.fail("[common_words] or [common_words_path] is set");
|
||||
} catch (IllegalArgumentException e) {
|
||||
} catch (IOException e) {
|
||||
} catch (IllegalArgumentException e) {} catch (IOException e) {
|
||||
fail("expected IAE");
|
||||
}
|
||||
}
|
||||
|
||||
public void testWithoutCommonWordsMatch() throws IOException {
|
||||
{
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.common_grams_default.type", "common_grams")
|
||||
.putList("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_default.type", "common_grams")
|
||||
.putList("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
{
|
||||
|
@ -85,11 +85,12 @@ public class CommonGramsTokenFilterFactoryTests extends OpenSearchTokenStreamTes
|
|||
}
|
||||
|
||||
{
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.common_grams_default.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_default.query_mode", false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.putList("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
|
||||
.build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_default.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_default.query_mode", false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.putList("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
{
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_default");
|
||||
|
@ -104,45 +105,85 @@ public class CommonGramsTokenFilterFactoryTests extends OpenSearchTokenStreamTes
|
|||
|
||||
public void testSettings() throws IOException {
|
||||
{
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.common_grams_1.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_1.ignore_case", true)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.putList("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
|
||||
.build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_1.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_1.ignore_case", true)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.putList("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_1");
|
||||
String source = "the quick brown is a fox or noT";
|
||||
String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a",
|
||||
"a_fox", "fox", "fox_or", "or", "or_noT", "noT" };
|
||||
String[] expected = new String[] {
|
||||
"the",
|
||||
"the_quick",
|
||||
"quick",
|
||||
"brown",
|
||||
"brown_is",
|
||||
"is",
|
||||
"is_a",
|
||||
"a",
|
||||
"a_fox",
|
||||
"fox",
|
||||
"fox_or",
|
||||
"or",
|
||||
"or_noT",
|
||||
"noT" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
{
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.common_grams_2.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_2.ignore_case", false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.putList("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
|
||||
.build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_2.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_2.ignore_case", false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.putList("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_2");
|
||||
String source = "the quick brown is a fox or why noT";
|
||||
String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "" +
|
||||
"a_fox", "fox", "or", "why", "why_noT", "noT" };
|
||||
String[] expected = new String[] {
|
||||
"the",
|
||||
"the_quick",
|
||||
"quick",
|
||||
"brown",
|
||||
"brown_is",
|
||||
"is",
|
||||
"is_a",
|
||||
"a",
|
||||
"" + "a_fox",
|
||||
"fox",
|
||||
"or",
|
||||
"why",
|
||||
"why_noT",
|
||||
"noT" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
{
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.common_grams_3.type", "common_grams")
|
||||
.putList("index.analysis.filter.common_grams_3.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_3.type", "common_grams")
|
||||
.putList("index.analysis.filter.common_grams_3.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_3");
|
||||
String source = "the quick brown is a fox Or noT";
|
||||
String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a",
|
||||
"a_fox", "fox", "Or", "noT" };
|
||||
String[] expected = new String[] {
|
||||
"the",
|
||||
"the_quick",
|
||||
"quick",
|
||||
"brown",
|
||||
"brown_is",
|
||||
"is",
|
||||
"is_a",
|
||||
"a",
|
||||
"a_fox",
|
||||
"fox",
|
||||
"Or",
|
||||
"noT" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -152,37 +193,58 @@ public class CommonGramsTokenFilterFactoryTests extends OpenSearchTokenStreamTes
|
|||
public void testCommonGramsAnalysis() throws IOException {
|
||||
String json = "/org/opensearch/analysis/common/commongrams.json";
|
||||
Settings settings = Settings.builder()
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createHome())
|
||||
.build();
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createHome())
|
||||
.build();
|
||||
{
|
||||
IndexAnalyzers indexAnalyzers = createTestAnalysisFromSettings(settings)
|
||||
.indexAnalyzers;
|
||||
IndexAnalyzers indexAnalyzers = createTestAnalysisFromSettings(settings).indexAnalyzers;
|
||||
Analyzer analyzer = indexAnalyzers.get("commongramsAnalyzer").analyzer();
|
||||
String source = "the quick brown is a fox or not";
|
||||
String[] expected = new String[] { "the", "quick", "quick_brown", "brown", "brown_is", "is", "a", "a_fox",
|
||||
"fox", "fox_or", "or", "not" };
|
||||
String[] expected = new String[] {
|
||||
"the",
|
||||
"quick",
|
||||
"quick_brown",
|
||||
"brown",
|
||||
"brown_is",
|
||||
"is",
|
||||
"a",
|
||||
"a_fox",
|
||||
"fox",
|
||||
"fox_or",
|
||||
"or",
|
||||
"not" };
|
||||
assertTokenStreamContents(analyzer.tokenStream("test", source), expected);
|
||||
}
|
||||
{
|
||||
IndexAnalyzers indexAnalyzers = createTestAnalysisFromSettings(settings)
|
||||
.indexAnalyzers;
|
||||
IndexAnalyzers indexAnalyzers = createTestAnalysisFromSettings(settings).indexAnalyzers;
|
||||
Analyzer analyzer = indexAnalyzers.get("commongramsAnalyzer_file").analyzer();
|
||||
String source = "the quick brown is a fox or not";
|
||||
String[] expected = new String[] { "the", "quick", "quick_brown", "brown", "brown_is", "is", "a", "a_fox",
|
||||
"fox", "fox_or", "or", "not" };
|
||||
String[] expected = new String[] {
|
||||
"the",
|
||||
"quick",
|
||||
"quick_brown",
|
||||
"brown",
|
||||
"brown_is",
|
||||
"is",
|
||||
"a",
|
||||
"a_fox",
|
||||
"fox",
|
||||
"fox_or",
|
||||
"or",
|
||||
"not" };
|
||||
assertTokenStreamContents(analyzer.tokenStream("test", source), expected);
|
||||
}
|
||||
}
|
||||
|
||||
public void testQueryModeSettings() throws IOException {
|
||||
{
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.common_grams_1.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_1.query_mode", true)
|
||||
.putList("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
|
||||
.put("index.analysis.filter.common_grams_1.ignore_case", true)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_1.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_1.query_mode", true)
|
||||
.putList("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
|
||||
.put("index.analysis.filter.common_grams_1.ignore_case", true)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_1");
|
||||
String source = "the quick brown is a fox or noT";
|
||||
|
@ -192,12 +254,13 @@ public class CommonGramsTokenFilterFactoryTests extends OpenSearchTokenStreamTes
|
|||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
{
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.common_grams_2.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_2.query_mode", true)
|
||||
.putList("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
|
||||
.put("index.analysis.filter.common_grams_2.ignore_case", false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_2.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_2.query_mode", true)
|
||||
.putList("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
|
||||
.put("index.analysis.filter.common_grams_2.ignore_case", false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_2");
|
||||
String source = "the quick brown is a fox or why noT";
|
||||
|
@ -207,11 +270,12 @@ public class CommonGramsTokenFilterFactoryTests extends OpenSearchTokenStreamTes
|
|||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
{
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.common_grams_3.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_3.query_mode", true)
|
||||
.putList("index.analysis.filter.common_grams_3.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_3.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_3.query_mode", true)
|
||||
.putList("index.analysis.filter.common_grams_3.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_3");
|
||||
String source = "the quick brown is a fox or why noT";
|
||||
|
@ -221,11 +285,12 @@ public class CommonGramsTokenFilterFactoryTests extends OpenSearchTokenStreamTes
|
|||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
{
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.common_grams_4.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_4.query_mode", true)
|
||||
.putList("index.analysis.filter.common_grams_4.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.common_grams_4.type", "common_grams")
|
||||
.put("index.analysis.filter.common_grams_4.query_mode", true)
|
||||
.putList("index.analysis.filter.common_grams_4.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_4");
|
||||
String source = "the quick brown is a fox Or noT";
|
||||
|
@ -239,20 +304,18 @@ public class CommonGramsTokenFilterFactoryTests extends OpenSearchTokenStreamTes
|
|||
public void testQueryModeCommonGramsAnalysis() throws IOException {
|
||||
String json = "/org/opensearch/analysis/common/commongrams_query_mode.json";
|
||||
Settings settings = Settings.builder()
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createHome())
|
||||
.build();
|
||||
.build();
|
||||
{
|
||||
IndexAnalyzers indexAnalyzers = createTestAnalysisFromSettings(settings)
|
||||
.indexAnalyzers;
|
||||
IndexAnalyzers indexAnalyzers = createTestAnalysisFromSettings(settings).indexAnalyzers;
|
||||
Analyzer analyzer = indexAnalyzers.get("commongramsAnalyzer").analyzer();
|
||||
String source = "the quick brown is a fox or not";
|
||||
String[] expected = new String[] { "the", "quick_brown", "brown_is", "is", "a_fox", "fox_or", "or", "not" };
|
||||
assertTokenStreamContents(analyzer.tokenStream("test", source), expected);
|
||||
}
|
||||
{
|
||||
IndexAnalyzers indexAnalyzers = createTestAnalysisFromSettings(settings)
|
||||
.indexAnalyzers;
|
||||
IndexAnalyzers indexAnalyzers = createTestAnalysisFromSettings(settings).indexAnalyzers;
|
||||
Analyzer analyzer = indexAnalyzers.get("commongramsAnalyzer_file").analyzer();
|
||||
String source = "the quick brown is a fox or not";
|
||||
String[] expected = new String[] { "the", "quick_brown", "brown_is", "is", "a_fox", "fox_or", "or", "not" };
|
||||
|
|
|
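The expected-token arrays in the common_grams tests above come straight from Lucene's CommonGramsFilter, so the behaviour can be reproduced without the OpenSearch analysis plumbing. A small standalone sketch, assuming lucene-analyzers-common is on the classpath; the class name is made up, and the word list and input mirror the common_grams_1 case with ignore_case set to true.

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CommonGramsDemo {
    public static void main(String[] args) throws Exception {
        // Same common word list as the "common_grams_1" settings above, with ignoreCase = true.
        CharArraySet commonWords = new CharArraySet(
            Arrays.asList("the", "or", "not", "a", "is", "an", "they", "are"),
            true
        );
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the quick brown is a fox or noT"));
        try (TokenStream stream = new CommonGramsFilter(tokenizer, commonWords)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // unigrams interleaved with "_"-joined bigrams over common words
            }
            stream.end();
        }
    }
}

The printed sequence should match the expected array in the test: the, the_quick, quick, brown, brown_is, is, is_a, a, a_fox, fox, fox_or, or, or_noT, noT.
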
@ -72,12 +72,14 @@ public class CompoundAnalysisTests extends OpenSearchTestCase {
|
|||
}
|
||||
|
||||
public void testDictionaryDecompounder() throws Exception {
|
||||
Settings[] settingsArr = new Settings[]{getJsonSettings(), getYamlSettings()};
|
||||
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
|
||||
for (Settings settings : settingsArr) {
|
||||
List<String> terms = analyze(settings, "decompoundingAnalyzer", "donaudampfschiff spargelcremesuppe");
|
||||
MatcherAssert.assertThat(terms.size(), equalTo(8));
|
||||
MatcherAssert.assertThat(terms,
|
||||
hasItems("donau", "dampf", "schiff", "donaudampfschiff", "spargel", "creme", "suppe", "spargelcremesuppe"));
|
||||
MatcherAssert.assertThat(
|
||||
terms,
|
||||
hasItems("donau", "dampf", "schiff", "donaudampfschiff", "spargel", "creme", "suppe", "spargelcremesuppe")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -87,7 +89,7 @@ public class CompoundAnalysisTests extends OpenSearchTestCase {
|
|||
IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
|
||||
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
|
||||
|
||||
TokenStream stream = analyzer.tokenStream("" , text);
|
||||
TokenStream stream = analyzer.tokenStream("", text);
|
||||
stream.reset();
|
||||
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
|
||||
|
||||
|
@ -112,18 +114,18 @@ public class CompoundAnalysisTests extends OpenSearchTestCase {
|
|||
private Settings getJsonSettings() throws IOException {
|
||||
String json = "/org/opensearch/analysis/common/test1.json";
|
||||
return Settings.builder()
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
}
|
||||
|
||||
private Settings getYamlSettings() throws IOException {
|
||||
String yaml = "/org/opensearch/analysis/common/test1.yml";
|
||||
return Settings.builder()
|
||||
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
|
|
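The decompounding assertion above ("donaudampfschiff spargelcremesuppe" producing eight terms) is driven by the decompoundingAnalyzer defined in the test1.json/test1.yml resources, which presumably wires up Lucene's dictionary decompounder. A rough standalone sketch under that assumption, with lucene-analyzers-common on the classpath; the class name and the hand-written dictionary are illustrative stand-ins for the word list loaded from those resource files.

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DecompoundDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical dictionary; the real test loads its word list from the test1.* resources.
        CharArraySet dictionary = new CharArraySet(
            Arrays.asList("donau", "dampf", "schiff", "spargel", "creme", "suppe"),
            true
        );
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("donaudampfschiff spargelcremesuppe"));
        try (TokenStream stream = new DictionaryCompoundWordTokenFilter(tokenizer, dictionary)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Should emit each original compound followed by its dictionary parts,
                // e.g. donaudampfschiff, donau, dampf, schiff, ...
                System.out.println(term);
            }
            stream.end();
        }
    }
}
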
@ -91,79 +91,52 @@ public class DisableGraphQueryTests extends OpenSearchSingleNodeTestCase {
|
|||
.put("index.analysis.analyzer.text_shingle.tokenizer", "whitespace")
|
||||
.put("index.analysis.analyzer.text_shingle.filter", "lowercase, shingle")
|
||||
.put("index.analysis.analyzer.text_shingle_unigram.tokenizer", "whitespace")
|
||||
.put("index.analysis.analyzer.text_shingle_unigram.filter",
|
||||
"lowercase, shingle_unigram")
|
||||
.put("index.analysis.analyzer.text_shingle_unigram.filter", "lowercase, shingle_unigram")
|
||||
.build();
|
||||
indexService = createIndex("test", settings, "t",
|
||||
"text_shingle", "type=text,analyzer=text_shingle",
|
||||
"text_shingle_unigram", "type=text,analyzer=text_shingle_unigram");
|
||||
indexService = createIndex(
|
||||
"test",
|
||||
settings,
|
||||
"t",
|
||||
"text_shingle",
|
||||
"type=text,analyzer=text_shingle",
|
||||
"text_shingle_unigram",
|
||||
"type=text,analyzer=text_shingle_unigram"
|
||||
);
|
||||
shardContext = indexService.newQueryShardContext(0, null, () -> 0L, null);
|
||||
|
||||
// parsed queries for "text_shingle_unigram:(foo bar baz)" with query parsers
|
||||
// that ignores position length attribute
|
||||
expectedQueryWithUnigram= new BooleanQuery.Builder()
|
||||
expectedQueryWithUnigram = new BooleanQuery.Builder().add(
|
||||
new SynonymQuery(new Term("text_shingle_unigram", "foo"), new Term("text_shingle_unigram", "foo bar")),
|
||||
BooleanClause.Occur.SHOULD
|
||||
)
|
||||
.add(
|
||||
new SynonymQuery(
|
||||
new Term("text_shingle_unigram", "foo"),
|
||||
new Term("text_shingle_unigram", "foo bar")
|
||||
), BooleanClause.Occur.SHOULD)
|
||||
.add(
|
||||
new SynonymQuery(
|
||||
new Term("text_shingle_unigram", "bar"),
|
||||
new Term("text_shingle_unigram", "bar baz")
|
||||
), BooleanClause.Occur.SHOULD)
|
||||
.add(
|
||||
new TermQuery(
|
||||
new Term("text_shingle_unigram", "baz")
|
||||
), BooleanClause.Occur.SHOULD)
|
||||
new SynonymQuery(new Term("text_shingle_unigram", "bar"), new Term("text_shingle_unigram", "bar baz")),
|
||||
BooleanClause.Occur.SHOULD
|
||||
)
|
||||
.add(new TermQuery(new Term("text_shingle_unigram", "baz")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
// parsed query for "text_shingle_unigram:\"foo bar baz\" with query parsers
|
||||
// that ignores position length attribute
|
||||
expectedPhraseQueryWithUnigram = new MultiPhraseQuery.Builder()
|
||||
.add(
|
||||
new Term[] {
|
||||
new Term("text_shingle_unigram", "foo"),
|
||||
new Term("text_shingle_unigram", "foo bar")
|
||||
}, 0)
|
||||
.add(
|
||||
new Term[] {
|
||||
new Term("text_shingle_unigram", "bar"),
|
||||
new Term("text_shingle_unigram", "bar baz")
|
||||
}, 1)
|
||||
.add(
|
||||
new Term[] {
|
||||
new Term("text_shingle_unigram", "baz"),
|
||||
}, 2)
|
||||
expectedPhraseQueryWithUnigram = new MultiPhraseQuery.Builder().add(
|
||||
new Term[] { new Term("text_shingle_unigram", "foo"), new Term("text_shingle_unigram", "foo bar") },
|
||||
0
|
||||
)
|
||||
.add(new Term[] { new Term("text_shingle_unigram", "bar"), new Term("text_shingle_unigram", "bar baz") }, 1)
|
||||
.add(new Term[] { new Term("text_shingle_unigram", "baz"), }, 2)
|
||||
.build();
|
||||
|
||||
// parsed query for "text_shingle:(foo bar baz)
|
||||
expectedQuery = new BooleanQuery.Builder()
|
||||
.add(
|
||||
new TermQuery(new Term("text_shingle", "foo bar")),
|
||||
BooleanClause.Occur.SHOULD
|
||||
)
|
||||
.add(
|
||||
new TermQuery(new Term("text_shingle","bar baz")),
|
||||
BooleanClause.Occur.SHOULD
|
||||
)
|
||||
.add(
|
||||
new TermQuery(new Term("text_shingle","baz biz")),
|
||||
BooleanClause.Occur.SHOULD
|
||||
)
|
||||
expectedQuery = new BooleanQuery.Builder().add(new TermQuery(new Term("text_shingle", "foo bar")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("text_shingle", "bar baz")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("text_shingle", "baz biz")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
// parsed query for "text_shingle:"foo bar baz"
|
||||
expectedPhraseQuery = new PhraseQuery.Builder()
|
||||
.add(
|
||||
new Term("text_shingle", "foo bar")
|
||||
)
|
||||
.add(
|
||||
new Term("text_shingle","bar baz")
|
||||
)
|
||||
.add(
|
||||
new Term("text_shingle","baz biz")
|
||||
)
|
||||
expectedPhraseQuery = new PhraseQuery.Builder().add(new Term("text_shingle", "foo bar"))
|
||||
.add(new Term("text_shingle", "bar baz"))
|
||||
.add(new Term("text_shingle", "baz biz"))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
@ -176,20 +149,17 @@ public class DisableGraphQueryTests extends OpenSearchSingleNodeTestCase {
|
|||
}
|
||||
|
||||
public void testMatchPhraseQuery() throws IOException {
|
||||
MatchPhraseQueryBuilder builder =
|
||||
new MatchPhraseQueryBuilder("text_shingle_unigram", "foo bar baz");
|
||||
MatchPhraseQueryBuilder builder = new MatchPhraseQueryBuilder("text_shingle_unigram", "foo bar baz");
|
||||
Query query = builder.toQuery(shardContext);
|
||||
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
|
||||
|
||||
builder =
|
||||
new MatchPhraseQueryBuilder("text_shingle", "foo bar baz biz");
|
||||
builder = new MatchPhraseQueryBuilder("text_shingle", "foo bar baz biz");
|
||||
query = builder.toQuery(shardContext);
|
||||
assertThat(expectedPhraseQuery, equalTo(query));
|
||||
}
|
||||
|
||||
public void testMatchQuery() throws IOException {
|
||||
MatchQueryBuilder builder =
|
||||
new MatchQueryBuilder("text_shingle_unigram", "foo bar baz");
|
||||
MatchQueryBuilder builder = new MatchQueryBuilder("text_shingle_unigram", "foo bar baz");
|
||||
Query query = builder.toQuery(shardContext);
|
||||
assertThat(expectedQueryWithUnigram, equalTo(query));
|
||||
|
||||
|
@ -199,8 +169,7 @@ public class DisableGraphQueryTests extends OpenSearchSingleNodeTestCase {
|
|||
}
|
||||
|
||||
public void testMultiMatchQuery() throws IOException {
|
||||
MultiMatchQueryBuilder builder = new MultiMatchQueryBuilder("foo bar baz",
|
||||
"text_shingle_unigram");
|
||||
MultiMatchQueryBuilder builder = new MultiMatchQueryBuilder("foo bar baz", "text_shingle_unigram");
|
||||
Query query = builder.toQuery(shardContext);
|
||||
assertThat(expectedQueryWithUnigram, equalTo(query));
|
||||
|
||||
|
|
|
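The expected queries being reflowed above are ordinary Lucene constructs; the commit only changes how their builders are laid out. For reference, a compact standalone version of the reformatted shingle-unigram boolean query, assuming lucene-core is on the classpath; the class name is illustrative, and the field/term values are copied from the test.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;

public class ShingleQueryDemo {
    public static void main(String[] args) {
        // Mirrors expectedQueryWithUnigram: each clause is a SHOULD, unigrams grouped with their shingle.
        BooleanQuery query = new BooleanQuery.Builder().add(
            new SynonymQuery(new Term("text_shingle_unigram", "foo"), new Term("text_shingle_unigram", "foo bar")),
            BooleanClause.Occur.SHOULD
        )
            .add(
                new SynonymQuery(new Term("text_shingle_unigram", "bar"), new Term("text_shingle_unigram", "bar baz")),
                BooleanClause.Occur.SHOULD
            )
            .add(new TermQuery(new Term("text_shingle_unigram", "baz")), BooleanClause.Occur.SHOULD)
            .build();
        System.out.println(query);
    }
}
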
@ -52,10 +52,11 @@ public class EdgeNGramTokenFilterFactoryTests extends OpenSearchTokenStreamTestC
|
|||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_edge_ngram.type", "edge_ngram")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_edge_ngram");
|
||||
String source = "foo";
|
||||
String[] expected = new String[]{"f", "fo"};
|
||||
String[] expected = new String[] { "f", "fo" };
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
@ -68,10 +69,11 @@ public class EdgeNGramTokenFilterFactoryTests extends OpenSearchTokenStreamTestC
|
|||
.put("index.analysis.filter.my_edge_ngram.type", "edge_ngram")
|
||||
.put("index.analysis.filter.my_edge_ngram.preserve_original", true)
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_edge_ngram");
|
||||
String source = "foo";
|
||||
String[] expected = new String[]{"f", "fo", "foo"};
|
||||
String[] expected = new String[] { "f", "fo", "foo" };
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
|
|
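The edge_ngram filter expectations above ("foo" yielding f and fo, plus foo when preserve_original is set) can be checked against the underlying Lucene filter directly. A minimal sketch, assuming lucene-analyzers-common is available; the class name is made up, and the 1/2/true arguments correspond to the min_gram, max_gram, and preserve_original settings exercised by the second test.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EdgeNGramDemo {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo"));
        // minGram = 1, maxGram = 2, preserveOriginal = true, as in the preserve_original test above.
        try (TokenStream stream = new EdgeNGramTokenFilter(tokenizer, 1, 2, true)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // f, fo, foo
            }
            stream.end();
        }
    }
}
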
@ -55,39 +55,44 @@ import java.util.Collections;
|
|||
public class EdgeNGramTokenizerTests extends OpenSearchTokenStreamTestCase {
|
||||
|
||||
private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, version)
|
||||
.put("index.analysis.analyzer.my_analyzer.tokenizer", tokenizer)
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
return new AnalysisModule(TestEnvironment.newEnvironment(settings),
|
||||
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
|
||||
return new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(new CommonAnalysisPlugin()))
|
||||
.getAnalysisRegistry()
|
||||
.build(idxSettings);
|
||||
}
|
||||
|
||||
public void testPreConfiguredTokenizer() throws IOException {
|
||||
|
||||
// Before 7.3 we return ngrams of length 1 only
|
||||
{
|
||||
Version version = VersionUtils.randomVersionBetween(random(), LegacyESVersion.fromString("7.0.0"),
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.fromString("7.3.0")));
|
||||
Version version = VersionUtils.randomVersionBetween(
|
||||
random(),
|
||||
LegacyESVersion.fromString("7.0.0"),
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.fromString("7.3.0"))
|
||||
);
|
||||
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edge_ngram")) {
|
||||
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "test", new String[]{"t"});
|
||||
assertAnalyzesTo(analyzer, "test", new String[] { "t" });
|
||||
}
|
||||
}
|
||||
|
||||
// Check deprecated name as well
|
||||
{
|
||||
Version version = VersionUtils.randomVersionBetween(random(), LegacyESVersion.fromString("7.0.0"),
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.fromString("7.3.0")));
|
||||
Version version = VersionUtils.randomVersionBetween(
|
||||
random(),
|
||||
LegacyESVersion.fromString("7.0.0"),
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.fromString("7.3.0"))
|
||||
);
|
||||
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edgeNGram")) {
|
||||
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "test", new String[]{"t"});
|
||||
assertAnalyzesTo(analyzer, "test", new String[] { "t" });
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -96,18 +101,21 @@ public class EdgeNGramTokenizerTests extends OpenSearchTokenStreamTestCase {
|
|||
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edge_ngram")) {
|
||||
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
|
||||
assertAnalyzesTo(analyzer, "test", new String[] { "t", "te" });
|
||||
}
|
||||
}
|
||||
|
||||
// Check deprecated name as well, needs version before 8.0 because throws IAE after that
|
||||
{
|
||||
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(
|
||||
try (
|
||||
IndexAnalyzers indexAnalyzers = buildAnalyzers(
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.fromString("7.3.0"), Version.CURRENT),
|
||||
"edgeNGram")) {
|
||||
"edgeNGram"
|
||||
)
|
||||
) {
|
||||
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
|
||||
assertAnalyzesTo(analyzer, "test", new String[] { "t", "te" });
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -119,12 +127,19 @@ public class EdgeNGramTokenizerTests extends OpenSearchTokenStreamTestCase {
|
|||
final String name = "engr";
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
|
||||
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
|
||||
.putList("token_chars", "letter", "custom").put("custom_token_chars","_-").build();
|
||||
Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name,
|
||||
settings).create();
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2)
|
||||
.put("max_gram", 3)
|
||||
.putList("token_chars", "letter", "custom")
|
||||
.put("custom_token_chars", "_-")
|
||||
.build();
|
||||
Tokenizer tokenizer = new EdgeNGramTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
name,
|
||||
settings
|
||||
).create();
|
||||
tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "-g", "-gh", "_j", "_jk", "lm"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "Ab", "Abc", "-g", "-gh", "_j", "_jk", "lm" });
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -47,8 +47,10 @@ public class ElisionFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()));
|
||||
IllegalArgumentException e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin())
|
||||
);
|
||||
|
||||
assertEquals("elision filter requires [articles] or [articles_path] setting", e.getMessage());
|
||||
}
|
||||
|
|
|
@ -40,34 +40,27 @@ public class FingerprintAnalyzerTests extends OpenSearchTokenStreamTestCase {
|
|||
|
||||
public void testFingerprint() throws Exception {
|
||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
|
||||
assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO",
|
||||
new String[]{"bar baz foo"});
|
||||
assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO", new String[] { "bar baz foo" });
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
|
||||
assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO",
|
||||
new String[]{"bar baz foo"});
|
||||
assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc",
|
||||
new String[]{"123.2 abc xyz"});
|
||||
assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO", new String[] { "bar baz foo" });
|
||||
assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc", new String[] { "123.2 abc xyz" });
|
||||
}
|
||||
|
||||
public void testAsciifolding() throws Exception {
|
||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
|
||||
assertAnalyzesTo(a, "gödel escher bach",
|
||||
new String[]{"bach escher godel"});
|
||||
assertAnalyzesTo(a, "gödel escher bach", new String[] { "bach escher godel" });
|
||||
|
||||
assertAnalyzesTo(a, "gödel godel escher bach",
|
||||
new String[]{"bach escher godel"});
|
||||
assertAnalyzesTo(a, "gödel godel escher bach", new String[] { "bach escher godel" });
|
||||
}
|
||||
|
||||
public void testLimit() throws Exception {
|
||||
Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3);
|
||||
assertAnalyzesTo(a, "e d c b a",
|
||||
new String[]{});
|
||||
assertAnalyzesTo(a, "e d c b a", new String[] {});
|
||||
|
||||
assertAnalyzesTo(a, "b a",
|
||||
new String[]{"a b"});
|
||||
assertAnalyzesTo(a, "b a", new String[] { "a b" });
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -54,27 +54,32 @@ public class FlattenGraphTokenFilterFactoryTests extends OpenSearchTokenStreamTe
|
|||
Settings settings = newAnalysisSettingsBuilder().build();
|
||||
|
||||
// "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
|
||||
TokenStream in = new CannedTokenStream(0, 12, new Token[] {
|
||||
token("wtf", 1, 5, 0, 3),
|
||||
token("what", 0, 1, 0, 3),
|
||||
token("wow", 0, 3, 0, 3),
|
||||
token("the", 1, 1, 0, 3),
|
||||
token("fudge", 1, 3, 0, 3),
|
||||
token("that's", 1, 1, 0, 3),
|
||||
token("funny", 1, 1, 0, 3),
|
||||
token("happened", 1, 1, 4, 12)
|
||||
});
|
||||
TokenStream in = new CannedTokenStream(
|
||||
0,
|
||||
12,
|
||||
new Token[] {
|
||||
token("wtf", 1, 5, 0, 3),
|
||||
token("what", 0, 1, 0, 3),
|
||||
token("wow", 0, 3, 0, 3),
|
||||
token("the", 1, 1, 0, 3),
|
||||
token("fudge", 1, 3, 0, 3),
|
||||
token("that's", 1, 1, 0, 3),
|
||||
token("funny", 1, 1, 0, 3),
|
||||
token("happened", 1, 1, 4, 12) }
|
||||
);
|
||||
|
||||
TokenStream tokens = new FlattenGraphTokenFilterFactory(indexProperties, null, name, settings).create(in);
|
||||
|
||||
// ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
|
||||
assertTokenStreamContents(tokens,
|
||||
new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
|
||||
new int[] {0, 0, 0, 0, 0, 0, 0, 4},
|
||||
new int[] {3, 3, 3, 3, 3, 3, 3, 12},
|
||||
new int[] {1, 0, 0, 1, 0, 1, 0, 1},
|
||||
new int[] {3, 1, 1, 1, 1, 1, 1, 1},
|
||||
12);
|
||||
assertTokenStreamContents(
|
||||
tokens,
|
||||
new String[] { "wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened" },
|
||||
new int[] { 0, 0, 0, 0, 0, 0, 0, 4 },
|
||||
new int[] { 3, 3, 3, 3, 3, 3, 3, 12 },
|
||||
new int[] { 1, 0, 0, 1, 0, 1, 0, 1 },
|
||||
new int[] { 3, 1, 1, 1, 1, 1, 1, 1 },
|
||||
12
|
||||
);
|
||||
}
|
||||
|
||||
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
|
||||
|
|
|
@ -69,26 +69,29 @@ public class HighlighterWithAnalyzersTests extends OpenSearchIntegTestCase {
}

public void testNgramHighlightingWithBrokenPositions() throws IOException {
assertAcked(prepareCreate("test")
.addMapping("test", jsonBuilder()
.startObject()
.startObject("test")
.startObject("properties")
.startObject("name")
.field("type", "text")
.startObject("fields")
.startObject("autocomplete")
.field("type", "text")
.field("analyzer", "autocomplete")
.field("search_analyzer", "search_autocomplete")
.field("term_vector", "with_positions_offsets")
.endObject()
.endObject()
.endObject()
.endObject()
.endObject()
.endObject())
.setSettings(Settings.builder()
assertAcked(
prepareCreate("test").addMapping(
"test",
jsonBuilder().startObject()
.startObject("test")
.startObject("properties")
.startObject("name")
.field("type", "text")
.startObject("fields")
.startObject("autocomplete")
.field("type", "text")
.field("analyzer", "autocomplete")
.field("search_analyzer", "search_autocomplete")
.field("term_vector", "with_positions_offsets")
.endObject()
.endObject()
.endObject()
.endObject()
.endObject()
.endObject()
)
.setSettings(
Settings.builder()
.put(indexSettings())
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 19)
.put("analysis.tokenizer.autocomplete.max_gram", 20)

@ -96,15 +99,31 @@ public class HighlighterWithAnalyzersTests extends OpenSearchIntegTestCase {
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
.put("analysis.tokenizer.autocomplete.type", "ngram")
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.putList("analysis.filter.wordDelimiter.type_table",
"& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
"? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM",
"# => ALPHANUM", "% => ALPHANUM", "+ => ALPHANUM",
", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM",
"/ => ALPHANUM", "^ => ALPHANUM", "$ => ALPHANUM",
"@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
"] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM",
"{ => ALPHANUM")
.putList(
"analysis.filter.wordDelimiter.type_table",
"& => ALPHANUM",
"| => ALPHANUM",
"! => ALPHANUM",
"? => ALPHANUM",
". => ALPHANUM",
"- => ALPHANUM",
"# => ALPHANUM",
"% => ALPHANUM",
"+ => ALPHANUM",
", => ALPHANUM",
"~ => ALPHANUM",
": => ALPHANUM",
"/ => ALPHANUM",
"^ => ALPHANUM",
"$ => ALPHANUM",
"@ => ALPHANUM",
") => ALPHANUM",
"( => ALPHANUM",
"] => ALPHANUM",
"[ => ALPHANUM",
"} => ALPHANUM",
"{ => ALPHANUM"
)
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
.put("analysis.filter.wordDelimiter.generate_number_parts", false)

@ -113,19 +132,19 @@ public class HighlighterWithAnalyzersTests extends OpenSearchIntegTestCase {
.put("analysis.filter.wordDelimiter.catenate_all", false)
.put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
.putList("analysis.analyzer.autocomplete.filter",
"lowercase", "wordDelimiter")
.putList("analysis.analyzer.autocomplete.filter", "lowercase", "wordDelimiter")
.put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
.putList("analysis.analyzer.search_autocomplete.filter",
"lowercase", "wordDelimiter")));
client().prepareIndex("test", "test", "1")
.setSource("name", "ARCOTEL Hotels Deutschland").get();
.putList("analysis.analyzer.search_autocomplete.filter", "lowercase", "wordDelimiter")
)
);
client().prepareIndex("test", "test", "1").setSource("name", "ARCOTEL Hotels Deutschland").get();
refresh();
SearchResponse search = client().prepareSearch("test").setTypes("test")
.setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
.highlighter(new HighlightBuilder().field("name.autocomplete")).get();
assertHighlight(search, 0, "name.autocomplete", 0,
equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
SearchResponse search = client().prepareSearch("test")
.setTypes("test")
.setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
.highlighter(new HighlightBuilder().field("name.autocomplete"))
.get();
assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
}

public void testMultiPhraseCutoff() throws IOException {

@ -133,48 +152,70 @@ public class HighlighterWithAnalyzersTests extends OpenSearchIntegTestCase {
|
|||
* MultiPhraseQuery can literally kill an entire node if there are too many terms in the
|
||||
* query. We cut off and extract terms if there are more than 16 terms in the query
|
||||
*/
|
||||
assertAcked(prepareCreate("test")
|
||||
.addMapping("test", "body", "type=text,analyzer=custom_analyzer,"
|
||||
+ "search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
|
||||
assertAcked(
|
||||
prepareCreate("test").addMapping(
|
||||
"test",
|
||||
"body",
|
||||
"type=text,analyzer=custom_analyzer," + "search_analyzer=custom_analyzer,term_vector=with_positions_offsets"
|
||||
)
|
||||
.setSettings(
|
||||
Settings.builder().put(indexSettings())
|
||||
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
|
||||
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
|
||||
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.generate_number_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_words", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_all", false)
|
||||
.put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
|
||||
.putList("analysis.analyzer.custom_analyzer.filter",
|
||||
"lowercase", "wordDelimiter"))
|
||||
Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
|
||||
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
|
||||
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.generate_number_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_words", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_all", false)
|
||||
.put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
|
||||
.putList("analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
|
||||
)
|
||||
);
|
||||
|
||||
ensureGreen();
|
||||
client().prepareIndex("test", "test", "1")
|
||||
.setSource("body", "Test: http://www.facebook.com http://elasticsearch.org "
|
||||
.setSource(
|
||||
"body",
|
||||
"Test: http://www.facebook.com http://elasticsearch.org "
|
||||
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this is "
|
||||
+ "a test for highlighting feature Test: http://www.facebook.com "
|
||||
+ "http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
|
||||
+ "http://twitter.com this is a test for highlighting feature")
|
||||
+ "http://twitter.com this is a test for highlighting feature"
|
||||
)
|
||||
.get();
|
||||
refresh();
|
||||
SearchResponse search = client().prepareSearch()
|
||||
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
|
||||
.highlighter(new HighlightBuilder().field("body").highlighterType("fvh")).get();
|
||||
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
|
||||
.highlighter(new HighlightBuilder().field("body").highlighterType("fvh"))
|
||||
.get();
|
||||
assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
|
||||
search = client()
|
||||
.prepareSearch()
|
||||
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "
|
||||
search = client().prepareSearch()
|
||||
.setQuery(
|
||||
matchPhraseQuery(
|
||||
"body",
|
||||
"Test: http://www.facebook.com "
|
||||
+ "http://elasticsearch.org http://xing.com http://cnn.com "
|
||||
+ "http://quora.com http://twitter.com this is a test for highlighting "
|
||||
+ "feature Test: http://www.facebook.com http://elasticsearch.org "
|
||||
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this "
|
||||
+ "is a test for highlighting feature"))
|
||||
.highlighter(new HighlightBuilder().field("body").highlighterType("fvh")).execute().actionGet();
|
||||
assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: "
|
||||
+ "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
|
||||
+ "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
|
||||
+ "is a test for highlighting feature"
|
||||
)
|
||||
)
|
||||
.highlighter(new HighlightBuilder().field("body").highlighterType("fvh"))
|
||||
.execute()
|
||||
.actionGet();
|
||||
assertHighlight(
|
||||
search,
|
||||
0,
|
||||
"body",
|
||||
0,
|
||||
equalTo(
|
||||
"<em>Test</em>: "
|
||||
+ "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
|
||||
+ "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
public void testSynonyms() throws IOException {
|
||||
|
@ -185,36 +226,29 @@ public class HighlighterWithAnalyzersTests extends OpenSearchIntegTestCase {
|
|||
.put("index.analysis.filter.synonym.type", "synonym")
|
||||
.putList("index.analysis.filter.synonym.synonyms", "fast,quick");
|
||||
|
||||
assertAcked(prepareCreate("test").setSettings(builder.build())
|
||||
.addMapping("type1", "field1",
|
||||
"type=text,term_vector=with_positions_offsets,search_analyzer=synonym," +
|
||||
"analyzer=standard,index_options=offsets"));
|
||||
assertAcked(
|
||||
prepareCreate("test").setSettings(builder.build())
|
||||
.addMapping(
|
||||
"type1",
|
||||
"field1",
|
||||
"type=text,term_vector=with_positions_offsets,search_analyzer=synonym," + "analyzer=standard,index_options=offsets"
|
||||
)
|
||||
);
|
||||
ensureGreen();
|
||||
|
||||
client().prepareIndex("test", "type1", "0").setSource(
|
||||
"field1", "The quick brown fox jumps over the lazy dog").get();
|
||||
client().prepareIndex("test", "type1", "0").setSource("field1", "The quick brown fox jumps over the lazy dog").get();
|
||||
refresh();
|
||||
for (String highlighterType : new String[] {"plain", "fvh", "unified"}) {
|
||||
for (String highlighterType : new String[] { "plain", "fvh", "unified" }) {
|
||||
logger.info("--> highlighting (type=" + highlighterType + ") and searching on field1");
|
||||
SearchSourceBuilder source = searchSource()
|
||||
.query(matchQuery("field1", "quick brown fox").operator(Operator.AND))
|
||||
.highlighter(
|
||||
highlight()
|
||||
.field("field1")
|
||||
.order("score")
|
||||
.preTags("<x>")
|
||||
.postTags("</x>")
|
||||
.highlighterType(highlighterType));
|
||||
SearchSourceBuilder source = searchSource().query(matchQuery("field1", "quick brown fox").operator(Operator.AND))
|
||||
.highlighter(highlight().field("field1").order("score").preTags("<x>").postTags("</x>").highlighterType(highlighterType));
|
||||
SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet();
|
||||
assertHighlight(searchResponse, 0, "field1", 0, 1,
|
||||
equalTo("The <x>quick</x> <x>brown</x> <x>fox</x> jumps over the lazy dog"));
|
||||
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The <x>quick</x> <x>brown</x> <x>fox</x> jumps over the lazy dog"));
|
||||
|
||||
source = searchSource()
|
||||
.query(matchQuery("field1", "fast brown fox").operator(Operator.AND))
|
||||
source = searchSource().query(matchQuery("field1", "fast brown fox").operator(Operator.AND))
|
||||
.highlighter(highlight().field("field1").order("score").preTags("<x>").postTags("</x>"));
|
||||
searchResponse = client().search(searchRequest("test").source(source)).actionGet();
|
||||
assertHighlight(searchResponse, 0, "field1", 0, 1,
|
||||
equalTo("The <x>quick</x> <x>brown</x> <x>fox</x> jumps over the lazy dog"));
|
||||
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The <x>quick</x> <x>brown</x> <x>fox</x> jumps over the lazy dog"));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -230,93 +264,139 @@ public class HighlighterWithAnalyzersTests extends OpenSearchIntegTestCase {
|
|||
|
||||
ensureGreen();
|
||||
|
||||
client().prepareIndex("first_test_index", "type1", "0").setSource(
|
||||
"field0", "The quick brown fox jumps over the lazy dog",
|
||||
"field1", "The quick brown fox jumps over the lazy dog").get();
|
||||
client().prepareIndex("first_test_index", "type1", "1").setSource("field1",
|
||||
"The quick browse button is a fancy thing, right bro?").get();
|
||||
client().prepareIndex("first_test_index", "type1", "0")
|
||||
.setSource("field0", "The quick brown fox jumps over the lazy dog", "field1", "The quick brown fox jumps over the lazy dog")
|
||||
.get();
|
||||
client().prepareIndex("first_test_index", "type1", "1")
|
||||
.setSource("field1", "The quick browse button is a fancy thing, right bro?")
|
||||
.get();
|
||||
refresh();
|
||||
logger.info("--> highlighting and searching on field0");
|
||||
|
||||
SearchSourceBuilder source = searchSource()
|
||||
.query(matchPhrasePrefixQuery("field0", "bro"))
|
||||
SearchSourceBuilder source = searchSource().query(matchPhrasePrefixQuery("field0", "bro"))
|
||||
.highlighter(highlight().field("field0").order("score").preTags("<x>").postTags("</x>"));
|
||||
SearchResponse searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet();
|
||||
|
||||
assertHighlight(searchResponse, 0, "field0", 0, 1, equalTo("The quick <x>brown</x> fox jumps over the lazy dog"));
|
||||
|
||||
source = searchSource()
|
||||
.query(matchPhrasePrefixQuery("field0", "quick bro"))
|
||||
source = searchSource().query(matchPhrasePrefixQuery("field0", "quick bro"))
|
||||
.highlighter(highlight().field("field0").order("score").preTags("<x>").postTags("</x>"));
|
||||
|
||||
searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet();
|
||||
assertHighlight(searchResponse, 0, "field0", 0, 1,
|
||||
equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
|
||||
assertHighlight(searchResponse, 0, "field0", 0, 1, equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
|
||||
|
||||
logger.info("--> highlighting and searching on field1");
|
||||
source = searchSource()
|
||||
.query(boolQuery()
|
||||
.should(matchPhrasePrefixQuery("field1", "test"))
|
||||
.should(matchPhrasePrefixQuery("field1", "bro"))
|
||||
)
|
||||
.highlighter(highlight().field("field1").order("score").preTags("<x>").postTags("</x>"));
|
||||
source = searchSource().query(
|
||||
boolQuery().should(matchPhrasePrefixQuery("field1", "test")).should(matchPhrasePrefixQuery("field1", "bro"))
|
||||
).highlighter(highlight().field("field1").order("score").preTags("<x>").postTags("</x>"));
|
||||
|
||||
searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet();
|
||||
assertThat(searchResponse.getHits().getTotalHits().value, equalTo(2L));
|
||||
for (int i = 0; i < 2; i++) {
|
||||
assertHighlight(searchResponse, i, "field1", 0, 1, anyOf(
|
||||
equalTo("The quick <x>browse</x> button is a fancy thing, right <x>bro</x>?"),
|
||||
equalTo("The quick <x>brown</x> fox jumps over the lazy dog")));
|
||||
assertHighlight(
|
||||
searchResponse,
|
||||
i,
|
||||
"field1",
|
||||
0,
|
||||
1,
|
||||
anyOf(
|
||||
equalTo("The quick <x>browse</x> button is a fancy thing, right <x>bro</x>?"),
|
||||
equalTo("The quick <x>brown</x> fox jumps over the lazy dog")
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
source = searchSource()
|
||||
.query(matchPhrasePrefixQuery("field1", "quick bro"))
|
||||
source = searchSource().query(matchPhrasePrefixQuery("field1", "quick bro"))
|
||||
.highlighter(highlight().field("field1").order("score").preTags("<x>").postTags("</x>"));
|
||||
|
||||
searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet();
|
||||
|
||||
assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf(
|
||||
equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
|
||||
equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")));
|
||||
assertHighlight(searchResponse, 1, "field1", 0, 1, anyOf(
|
||||
equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
|
||||
equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")));
|
||||
assertHighlight(
|
||||
searchResponse,
|
||||
0,
|
||||
"field1",
|
||||
0,
|
||||
1,
|
||||
anyOf(
|
||||
equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
|
||||
equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
|
||||
)
|
||||
);
|
||||
assertHighlight(
|
||||
searchResponse,
|
||||
1,
|
||||
"field1",
|
||||
0,
|
||||
1,
|
||||
anyOf(
|
||||
equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
|
||||
equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
|
||||
)
|
||||
);
|
||||
|
||||
assertAcked(prepareCreate("second_test_index").setSettings(builder.build()).addMapping("doc",
|
||||
"field4", "type=text,term_vector=with_positions_offsets,analyzer=synonym",
|
||||
"field3", "type=text,analyzer=synonym"));
|
||||
assertAcked(
|
||||
prepareCreate("second_test_index").setSettings(builder.build())
|
||||
.addMapping(
|
||||
"doc",
|
||||
"field4",
|
||||
"type=text,term_vector=with_positions_offsets,analyzer=synonym",
|
||||
"field3",
|
||||
"type=text,analyzer=synonym"
|
||||
)
|
||||
);
|
||||
// with synonyms
|
||||
client().prepareIndex("second_test_index", "doc", "0").setSource(
|
||||
"type", "type2",
|
||||
"field4", "The quick brown fox jumps over the lazy dog",
|
||||
"field3", "The quick brown fox jumps over the lazy dog").get();
|
||||
client().prepareIndex("second_test_index", "doc", "1").setSource(
|
||||
"type", "type2",
|
||||
"field4", "The quick browse button is a fancy thing, right bro?").get();
|
||||
client().prepareIndex("second_test_index", "doc", "2").setSource(
|
||||
"type", "type2",
|
||||
"field4", "a quick fast blue car").get();
|
||||
client().prepareIndex("second_test_index", "doc", "0")
|
||||
.setSource(
|
||||
"type",
|
||||
"type2",
|
||||
"field4",
|
||||
"The quick brown fox jumps over the lazy dog",
|
||||
"field3",
|
||||
"The quick brown fox jumps over the lazy dog"
|
||||
)
|
||||
.get();
|
||||
client().prepareIndex("second_test_index", "doc", "1")
|
||||
.setSource("type", "type2", "field4", "The quick browse button is a fancy thing, right bro?")
|
||||
.get();
|
||||
client().prepareIndex("second_test_index", "doc", "2").setSource("type", "type2", "field4", "a quick fast blue car").get();
|
||||
refresh();
|
||||
|
||||
source = searchSource().postFilter(termQuery("type", "type2")).query(matchPhrasePrefixQuery("field3", "fast bro"))
|
||||
source = searchSource().postFilter(termQuery("type", "type2"))
|
||||
.query(matchPhrasePrefixQuery("field3", "fast bro"))
|
||||
.highlighter(highlight().field("field3").order("score").preTags("<x>").postTags("</x>"));
|
||||
|
||||
searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet();
|
||||
|
||||
assertHighlight(searchResponse, 0, "field3", 0, 1,
|
||||
equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
|
||||
assertHighlight(searchResponse, 0, "field3", 0, 1, equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
|
||||
|
||||
logger.info("--> highlighting and searching on field4");
|
||||
source = searchSource().postFilter(termQuery("type", "type2")).query(matchPhrasePrefixQuery("field4", "the fast bro"))
|
||||
source = searchSource().postFilter(termQuery("type", "type2"))
|
||||
.query(matchPhrasePrefixQuery("field4", "the fast bro"))
|
||||
.highlighter(highlight().field("field4").order("score").preTags("<x>").postTags("</x>"));
|
||||
searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet();
|
||||
|
||||
assertHighlight(searchResponse, 0, "field4", 0, 1, anyOf(
|
||||
equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
|
||||
equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")));
|
||||
assertHighlight(searchResponse, 1, "field4", 0, 1, anyOf(
|
||||
equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
|
||||
equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")));
|
||||
assertHighlight(
|
||||
searchResponse,
|
||||
0,
|
||||
"field4",
|
||||
0,
|
||||
1,
|
||||
anyOf(
|
||||
equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
|
||||
equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
|
||||
)
|
||||
);
|
||||
assertHighlight(
|
||||
searchResponse,
|
||||
1,
|
||||
"field4",
|
||||
0,
|
||||
1,
|
||||
anyOf(
|
||||
equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
|
||||
equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
|
||||
)
|
||||
);
|
||||
|
||||
logger.info("--> highlighting and searching on field4");
|
||||
source = searchSource().postFilter(termQuery("type", "type2"))
|
||||
|
@ -324,17 +404,31 @@ public class HighlighterWithAnalyzersTests extends OpenSearchIntegTestCase {
|
|||
.highlighter(highlight().field("field4").order("score").preTags("<x>").postTags("</x>"));
|
||||
searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet();
|
||||
|
||||
assertHighlight(searchResponse, 0, "field4", 0, 1,
|
||||
anyOf(equalTo("<x>a quick fast blue car</x>"),
|
||||
equalTo("<x>a</x> <x>quick</x> <x>fast</x> <x>blue</x> <x>car</x>")));
|
||||
assertHighlight(
|
||||
searchResponse,
|
||||
0,
|
||||
"field4",
|
||||
0,
|
||||
1,
|
||||
anyOf(equalTo("<x>a quick fast blue car</x>"), equalTo("<x>a</x> <x>quick</x> <x>fast</x> <x>blue</x> <x>car</x>"))
|
||||
);
|
||||
}
|
||||
|
||||
public static XContentBuilder type1TermVectorMapping() throws IOException {
|
||||
return XContentFactory.jsonBuilder().startObject().startObject("type1")
|
||||
return XContentFactory.jsonBuilder()
|
||||
.startObject()
|
||||
.startObject("type1")
|
||||
.startObject("properties")
|
||||
.startObject("field1").field("type", "text").field("term_vector", "with_positions_offsets").endObject()
|
||||
.startObject("field2").field("type", "text").field("term_vector", "with_positions_offsets").endObject()
|
||||
.startObject("field1")
|
||||
.field("type", "text")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.endObject()
|
||||
.endObject().endObject();
|
||||
.startObject("field2")
|
||||
.field("type", "text")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -47,25 +47,29 @@ import java.io.IOException;
import java.io.StringReader;
import java.util.Map;

public class HtmlStripCharFilterFactoryTests extends OpenSearchTestCase {

/**
* Check that the deprecated name "htmlStrip" issues a deprecation warning for indices created since 6.3.0
*/
public void testDeprecationWarning() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetadata.SETTING_VERSION_CREATED,
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_6_3_0, Version.CURRENT))
.build();
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(
IndexMetadata.SETTING_VERSION_CREATED,
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_6_3_0, Version.CURRENT)
)
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, CharFilterFactory> charFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).charFilter;
CharFilterFactory charFilterFactory = charFilters.get("htmlStrip");
assertNotNull(charFilterFactory.create(new StringReader("input")));
assertWarnings("The [htmpStrip] char filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [html_strip] instead.");
assertWarnings(
"The [htmpStrip] char filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [html_strip] instead."
);
}
}

@ -73,10 +77,13 @@ public class HtmlStripCharFilterFactoryTests extends OpenSearchTestCase {
* Check that the deprecated name "htmlStrip" does NOT issues a deprecation warning for indices created before 6.3.0
*/
public void testNoDeprecationWarningPre6_3() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetadata.SETTING_VERSION_CREATED,
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_6_0_0, LegacyESVersion.V_6_2_4))
.build();
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(
IndexMetadata.SETTING_VERSION_CREATED,
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_6_0_0, LegacyESVersion.V_6_2_4)
)
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {

@ -52,51 +52,49 @@ public class KeepFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
|
||||
public void testLoadWithoutSettings() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(
|
||||
createTempDir(), RESOURCE, new CommonAnalysisPlugin());
|
||||
createTempDir(),
|
||||
RESOURCE,
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep");
|
||||
Assert.assertNull(tokenFilter);
|
||||
}
|
||||
|
||||
public void testLoadOverConfiguredSettings() {
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.broken_keep_filter.type", "keep")
|
||||
.put("index.analysis.filter.broken_keep_filter.keep_words_path", "does/not/exists.txt")
|
||||
.put("index.analysis.filter.broken_keep_filter.keep_words", "[\"Hello\", \"worlD\"]")
|
||||
.build();
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.broken_keep_filter.type", "keep")
|
||||
.put("index.analysis.filter.broken_keep_filter.keep_words_path", "does/not/exists.txt")
|
||||
.put("index.analysis.filter.broken_keep_filter.keep_words", "[\"Hello\", \"worlD\"]")
|
||||
.build();
|
||||
try {
|
||||
AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
Assert.fail("path and array are configured");
|
||||
} catch (IllegalArgumentException e) {
|
||||
} catch (IOException e) {
|
||||
} catch (IllegalArgumentException e) {} catch (IOException e) {
|
||||
fail("expected IAE");
|
||||
}
|
||||
}
|
||||
|
||||
public void testKeepWordsPathSettings() {
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.non_broken_keep_filter.type", "keep")
|
||||
.put("index.analysis.filter.non_broken_keep_filter.keep_words_path", "does/not/exists.txt")
|
||||
.build();
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.non_broken_keep_filter.type", "keep")
|
||||
.put("index.analysis.filter.non_broken_keep_filter.keep_words_path", "does/not/exists.txt")
|
||||
.build();
|
||||
try {
|
||||
// test our none existing setup is picked up
|
||||
AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
fail("expected an exception due to non existent keep_words_path");
|
||||
} catch (IllegalArgumentException e) {
|
||||
} catch (IOException e) {
|
||||
} catch (IllegalArgumentException e) {} catch (IOException e) {
|
||||
fail("expected IAE");
|
||||
}
|
||||
|
||||
settings = Settings.builder().put(settings)
|
||||
.putList("index.analysis.filter.non_broken_keep_filter.keep_words", "test")
|
||||
.build();
|
||||
settings = Settings.builder().put(settings).putList("index.analysis.filter.non_broken_keep_filter.keep_words", "test").build();
|
||||
try {
|
||||
// test our none existing setup is picked up
|
||||
AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
fail("expected an exception indicating that you can't use [keep_words_path] with [keep_words] ");
|
||||
} catch (IllegalArgumentException e) {
|
||||
} catch (IOException e) {
|
||||
} catch (IllegalArgumentException e) {} catch (IOException e) {
|
||||
fail("expected IAE");
|
||||
}
|
||||
|
||||
|
@ -104,25 +102,31 @@ public class KeepFilterFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
|
||||
public void testCaseInsensitiveMapping() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(
|
||||
createTempDir(), RESOURCE, new CommonAnalysisPlugin());
|
||||
createTempDir(),
|
||||
RESOURCE,
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keep_filter");
|
||||
assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
|
||||
String source = "hello small world";
|
||||
String[] expected = new String[]{"hello", "world"};
|
||||
String[] expected = new String[] { "hello", "world" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1, 2});
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1, 2 });
|
||||
}
|
||||
|
||||
public void testCaseSensitiveMapping() throws IOException {
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(
|
||||
createTempDir(), RESOURCE, new CommonAnalysisPlugin());
|
||||
createTempDir(),
|
||||
RESOURCE,
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_case_sensitive_keep_filter");
|
||||
assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
|
||||
String source = "Hello small world";
|
||||
String[] expected = new String[]{"Hello"};
|
||||
String[] expected = new String[] { "Hello" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1});
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1 });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -51,13 +51,16 @@ public class KeepTypesFilterFactoryTests extends OpenSearchTokenStreamTestCase {
private static final String BASE_SETTING = "index.analysis.filter.keep_numbers";

public void testKeepTypesInclude() throws IOException {
Settings.Builder settingsBuilder = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(BASE_SETTING + ".type", "keep_types")
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" });
Settings.Builder settingsBuilder = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(BASE_SETTING + ".type", "keep_types")
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" });
// either use default mode or set "include" mode explicitly
if (random().nextBoolean()) {
settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY,
KeepTypesFilterFactory.KeepTypesMode.INCLUDE);
settingsBuilder.put(
BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY,
KeepTypesFilterFactory.KeepTypesMode.INCLUDE
);
}
Settings settings = settingsBuilder.build();
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());

@ -71,10 +74,12 @@ public class KeepTypesFilterFactoryTests extends OpenSearchTokenStreamTestCase {
}

public void testKeepTypesExclude() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(BASE_SETTING + ".type", "keep_types")
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, KeepTypesFilterFactory.KeepTypesMode.EXCLUDE).build();
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(BASE_SETTING + ".type", "keep_types")
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, KeepTypesFilterFactory.KeepTypesMode.EXCLUDE)
.build();
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));

@ -86,12 +91,16 @@ public class KeepTypesFilterFactoryTests extends OpenSearchTokenStreamTestCase {
}

public void testKeepTypesException() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(BASE_SETTING + ".type", "keep_types")
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, "bad_parameter").build();
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()));
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(BASE_SETTING + ".type", "keep_types")
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, "bad_parameter")
.build();
IllegalArgumentException ex = expectThrows(
IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin())
);
assertEquals("`keep_types` tokenfilter mode can only be [include] or [exclude] but was [bad_parameter].", ex.getMessage());
}
}

@ -72,8 +72,7 @@ public class KeywordMarkerFilterFactoryTests extends OpenSearchTokenStreamTestCa
assertThat(filter, instanceOf(SetKeywordMarkerFilter.class));
NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_keyword");
// jogging is not part of the keywords set, so verify that its the only stemmed word
assertAnalyzesTo(analyzer, "running jogging sleeping",
new String[] { "running", "jog", "sleeping" });
assertAnalyzesTo(analyzer, "running jogging sleeping", new String[] { "running", "jog", "sleeping" });
}

/**

@ -111,9 +110,10 @@ public class KeywordMarkerFilterFactoryTests extends OpenSearchTokenStreamTestCa
.put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()));
assertEquals("cannot specify both `keywords_pattern` and `keywords` or `keywords_path`",
e.getMessage());
IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin())
);
assertEquals("cannot specify both `keywords_pattern` and `keywords` or `keywords_path`", e.getMessage());
}
}

@ -47,9 +47,9 @@ import java.io.StringReader;
public class LimitTokenCountFilterFactoryTests extends OpenSearchTokenStreamTestCase {
public void testDefault() throws IOException {
Settings settings = Settings.builder()
.put("index.analysis.filter.limit_default.type", "limit")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
.put("index.analysis.filter.limit_default.type", "limit")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
{
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("limit_default");

@ -72,11 +72,11 @@ public class LimitTokenCountFilterFactoryTests extends OpenSearchTokenStreamTest
|
|||
public void testSettings() throws IOException {
|
||||
{
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.limit_1.type", "limit")
|
||||
.put("index.analysis.filter.limit_1.max_token_count", 3)
|
||||
.put("index.analysis.filter.limit_1.consume_all_tokens", true)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
.put("index.analysis.filter.limit_1.type", "limit")
|
||||
.put("index.analysis.filter.limit_1.max_token_count", 3)
|
||||
.put("index.analysis.filter.limit_1.consume_all_tokens", true)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("limit_1");
|
||||
String source = "the quick brown fox";
|
||||
|
@ -87,11 +87,11 @@ public class LimitTokenCountFilterFactoryTests extends OpenSearchTokenStreamTest
|
|||
}
|
||||
{
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.limit_1.type", "limit")
|
||||
.put("index.analysis.filter.limit_1.max_token_count", 3)
|
||||
.put("index.analysis.filter.limit_1.consume_all_tokens", false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
.put("index.analysis.filter.limit_1.type", "limit")
|
||||
.put("index.analysis.filter.limit_1.max_token_count", 3)
|
||||
.put("index.analysis.filter.limit_1.consume_all_tokens", false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("limit_1");
|
||||
String source = "the quick brown fox";
|
||||
|
@ -103,11 +103,11 @@ public class LimitTokenCountFilterFactoryTests extends OpenSearchTokenStreamTest
|
|||
|
||||
{
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.limit_1.type", "limit")
|
||||
.put("index.analysis.filter.limit_1.max_token_count", 17)
|
||||
.put("index.analysis.filter.limit_1.consume_all_tokens", true)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
.put("index.analysis.filter.limit_1.type", "limit")
|
||||
.put("index.analysis.filter.limit_1.max_token_count", 17)
|
||||
.put("index.analysis.filter.limit_1.consume_all_tokens", true)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
OpenSearchTestCase.TestAnalysis analysis = createTestAnalysisFromSettings(settings);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("limit_1");
|
||||
String source = "the quick brown fox";
|
||||
|
|
|
@ -51,13 +51,18 @@ public class MassiveWordListTests extends OpenSearchSingleNodeTestCase {
for (int i = 0; i < wordList.length; i++) {
wordList[i] = "hello world";
}
client().admin().indices().prepareCreate("test").setSettings(Settings.builder()
.put("index.number_of_shards", 1)
.put("analysis.analyzer.test_analyzer.type", "custom")
.put("analysis.analyzer.test_analyzer.tokenizer", "standard")
.putList("analysis.analyzer.test_analyzer.filter", "dictionary_decompounder", "lowercase")
.put("analysis.filter.dictionary_decompounder.type", "dictionary_decompounder")
.putList("analysis.filter.dictionary_decompounder.word_list", wordList)
).get();
client().admin()
.indices()
.prepareCreate("test")
.setSettings(
Settings.builder()
.put("index.number_of_shards", 1)
.put("analysis.analyzer.test_analyzer.type", "custom")
.put("analysis.analyzer.test_analyzer.tokenizer", "standard")
.putList("analysis.analyzer.test_analyzer.filter", "dictionary_decompounder", "lowercase")
.put("analysis.filter.dictionary_decompounder.type", "dictionary_decompounder")
.putList("analysis.filter.dictionary_decompounder.word_list", wordList)
)
.get();
}
}

@ -49,9 +49,7 @@ public class MinHashFilterFactoryTests extends OpenSearchTokenStreamTestCase {
int default_hash_count = 1;
int default_bucket_size = 512;
int default_hash_set_size = 1;
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
String source = "the quick brown fox";

@ -60,8 +58,7 @@ public class MinHashFilterFactoryTests extends OpenSearchTokenStreamTestCase {

// with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
// have enough tokens to fill all the buckets, we still expect 512 tokens.
assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer),
default_hash_count * default_bucket_size * default_hash_set_size);
assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), default_hash_count * default_bucket_size * default_hash_set_size);
}

public void testSettings() throws IOException {

@ -50,9 +50,7 @@ import java.util.Collections;
public class MultiplexerTokenFilterTests extends OpenSearchTokenStreamTestCase {

public void testMultiplexingFilter() throws IOException {
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
Settings indexSettings = Settings.builder()
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
.put("index.analysis.filter.t.type", "truncate")

@ -65,30 +63,27 @@ public class MultiplexerTokenFilterTests extends OpenSearchTokenStreamTestCase {
|
|||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
|
||||
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(
|
||||
TestEnvironment.newEnvironment(settings),
|
||||
Collections.singletonList(new CommonAnalysisPlugin())
|
||||
).getAnalysisRegistry().build(idxSettings);
|
||||
|
||||
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
|
||||
"ONe", "on", "ONE", "tHree", "th", "THREE"
|
||||
}, new int[]{
|
||||
1, 0, 0, 1, 0, 0
|
||||
});
|
||||
assertAnalyzesTo(
|
||||
analyzer,
|
||||
"ONe tHree",
|
||||
new String[] { "ONe", "on", "ONE", "tHree", "th", "THREE" },
|
||||
new int[] { 1, 0, 0, 1, 0, 0 }
|
||||
);
|
||||
// Duplicates are removed
|
||||
assertAnalyzesTo(analyzer, "ONe THREE", new String[]{
|
||||
"ONe", "on", "ONE", "THREE", "th"
|
||||
}, new int[]{
|
||||
1, 0, 0, 1, 0, 0
|
||||
});
|
||||
assertAnalyzesTo(analyzer, "ONe THREE", new String[] { "ONe", "on", "ONE", "THREE", "th" }, new int[] { 1, 0, 0, 1, 0, 0 });
|
||||
}
|
||||
}
|
||||
|
||||
public void testMultiplexingNoOriginal() throws IOException {
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("index.analysis.filter.t.type", "truncate")
|
||||
|
@ -102,16 +97,14 @@ public class MultiplexerTokenFilterTests extends OpenSearchTokenStreamTestCase {
|
|||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
|
||||
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(
|
||||
TestEnvironment.newEnvironment(settings),
|
||||
Collections.singletonList(new CommonAnalysisPlugin())
|
||||
).getAnalysisRegistry().build(idxSettings);
|
||||
|
||||
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
|
||||
"on", "ONE", "th", "THREE"
|
||||
}, new int[]{
|
||||
1, 0, 1, 0,
|
||||
});
|
||||
assertAnalyzesTo(analyzer, "ONe tHree", new String[] { "on", "ONE", "th", "THREE" }, new int[] { 1, 0, 1, 0, });
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -52,10 +52,11 @@ public class NGramTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_ngram.type", "ngram")
.build(),
new CommonAnalysisPlugin());
new CommonAnalysisPlugin()
);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ngram");
String source = "foo";
String[] expected = new String[]{"f", "fo", "o", "oo", "o"};
String[] expected = new String[] { "f", "fo", "o", "oo", "o" };
Tokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);

@ -68,10 +69,11 @@ public class NGramTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase
.put("index.analysis.filter.my_ngram.type", "ngram")
.put("index.analysis.filter.my_ngram.preserve_original", true)
.build(),
new CommonAnalysisPlugin());
new CommonAnalysisPlugin()
);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ngram");
String source = "foo";
String[] expected = new String[]{"f", "fo", "o", "oo", "o", "foo"};
String[] expected = new String[] { "f", "fo", "o", "oo", "o", "foo" };
Tokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);

@ -61,30 +61,43 @@ public class NGramTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
final IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
|
||||
for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
|
||||
.put("token_chars", tokenChars).build();
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2)
|
||||
.put("max_gram", 3)
|
||||
.put("token_chars", tokenChars)
|
||||
.build();
|
||||
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
|
||||
// no exception
|
||||
}
|
||||
{
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
|
||||
.put("token_chars", "DIRECTIONALITY_UNDEFINED").build();
|
||||
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
|
||||
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2)
|
||||
.put("max_gram", 3)
|
||||
.put("token_chars", "DIRECTIONALITY_UNDEFINED")
|
||||
.build();
|
||||
IllegalArgumentException ex = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create()
|
||||
);
|
||||
assertEquals("Unknown token type: 'directionality_undefined'", ex.getMessage().substring(0, 46));
|
||||
assertTrue(ex.getMessage().contains("custom"));
|
||||
}
|
||||
{
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
|
||||
.put("custom_token_chars", "_-").build();
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2)
|
||||
.put("max_gram", 3)
|
||||
.put("token_chars", "custom")
|
||||
.put("custom_token_chars", "_-")
|
||||
.build();
|
||||
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
|
||||
// no exception
|
||||
}
|
||||
{
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
|
||||
.build();
|
||||
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
|
||||
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2)
|
||||
.put("max_gram", 3)
|
||||
.put("token_chars", "custom")
|
||||
.build();
|
||||
IllegalArgumentException ex = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create()
|
||||
);
|
||||
assertEquals("Token type: 'custom' requires setting `custom_token_chars`", ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
@ -94,12 +107,14 @@ public class NGramTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
final String name = "ngr";
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
|
||||
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4)
|
||||
.putList("token_chars", new String[0]).build();
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2)
|
||||
.put("max_gram", 4)
|
||||
.putList("token_chars", new String[0])
|
||||
.build();
|
||||
Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
|
||||
.create();
|
||||
tokenizer.setReader(new StringReader("1.34"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "1.", "1.3", "1.34", ".3", ".34", "34" });
|
||||
}
|
||||
|
||||
public void testCustomTokenChars() throws IOException {
|
||||
|
@ -107,12 +122,15 @@ public class NGramTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
final String name = "ngr";
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
|
||||
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
|
||||
.putList("token_chars", "letter", "custom").put("custom_token_chars","_-").build();
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2)
|
||||
.put("max_gram", 3)
|
||||
.putList("token_chars", "letter", "custom")
|
||||
.put("custom_token_chars", "_-")
|
||||
.build();
|
||||
Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
|
||||
.create();
|
||||
tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "bc", "-g", "-gh", "gh", "_j", "_jk", "jk", "lm"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "Ab", "Abc", "bc", "-g", "-gh", "gh", "_j", "_jk", "jk", "lm" });
|
||||
}
|
||||
|
||||
public void testPreTokenization() throws IOException {
|
||||
|
@ -120,19 +138,21 @@ public class NGramTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
final Index index = new Index("test", "_na_");
|
||||
final String name = "ngr";
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
|
||||
.put("token_chars", "letter,digit").build();
|
||||
Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
|
||||
Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
|
||||
.create();
|
||||
tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
|
||||
assertTokenStreamContents(tokenizer,
|
||||
new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});
|
||||
settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
|
||||
.put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
|
||||
assertTokenStreamContents(
|
||||
tokenizer,
|
||||
new String[] { "Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f" }
|
||||
);
|
||||
settings = newAnalysisSettingsBuilder().put("min_gram", 2)
|
||||
.put("max_gram", 3)
|
||||
.put("token_chars", "letter,digit,punctuation,whitespace,symbol")
|
||||
.build();
|
||||
tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
|
||||
tokenizer.setReader(new StringReader(" a!$ 9"));
|
||||
assertTokenStreamContents(tokenizer,
|
||||
new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { " a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9" });
|
||||
}
|
||||
|
||||
public void testPreTokenizationEdge() throws IOException {
|
||||
|
@ -141,18 +161,22 @@ public class NGramTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
final String name = "ngr";
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
|
||||
Tokenizer tokenizer =
|
||||
new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
|
||||
Tokenizer tokenizer = new EdgeNGramTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
name,
|
||||
settings
|
||||
).create();
|
||||
tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
|
||||
assertTokenStreamContents(tokenizer,
|
||||
new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});
|
||||
settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
|
||||
.put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
|
||||
assertTokenStreamContents(tokenizer, new String[] { "Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f" });
|
||||
settings = newAnalysisSettingsBuilder().put("min_gram", 2)
|
||||
.put("max_gram", 3)
|
||||
.put("token_chars", "letter,digit,punctuation,whitespace,symbol")
|
||||
.build();
|
||||
tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
|
||||
.create();
|
||||
tokenizer.setReader(new StringReader(" a!$ 9"));
|
||||
assertTokenStreamContents(tokenizer,
|
||||
new String[] {" a", " a!"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { " a", " a!" });
|
||||
}
|
||||
|
||||
public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
|
||||
|
@ -170,9 +194,12 @@ public class NGramTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
Settings indexSettings = newAnalysisSettingsBuilder().put(IndexMetadata.SETTING_VERSION_CREATED, v.id).build();
|
||||
Tokenizer tokenizer = new MockTokenizer();
|
||||
tokenizer.setReader(new StringReader("foo bar"));
|
||||
TokenStream edgeNGramTokenFilter =
|
||||
new EdgeNGramTokenFilterFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
|
||||
.create(tokenizer);
|
||||
TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
name,
|
||||
settings
|
||||
).create(tokenizer);
|
||||
if (reverse) {
|
||||
assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class));
|
||||
} else {
|
||||
|
@ -185,7 +212,7 @@ public class NGramTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
* test that throws an error when trying to get a NGramTokenizer where difference between max_gram and min_gram
|
||||
* is greater than the allowed value of max_ngram_diff
|
||||
*/
|
||||
public void testMaxNGramDiffException() throws Exception{
|
||||
public void testMaxNGramDiffException() throws Exception {
|
||||
final Index index = new Index("test", "_na_");
|
||||
final String name = "ngr";
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
|
@ -197,12 +224,19 @@ public class NGramTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
|
|||
int max_gram = min_gram + ngramDiff;
|
||||
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", min_gram).put("max_gram", max_gram).build();
|
||||
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () ->
|
||||
new NGramTokenizerFactory(indexProperties, null, name, settings).create());
|
||||
IllegalArgumentException ex = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create()
|
||||
);
|
||||
assertEquals(
|
||||
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
|
||||
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
|
||||
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.",
|
||||
ex.getMessage());
|
||||
+ maxAllowedNgramDiff
|
||||
+ "] but was ["
|
||||
+ ngramDiff
|
||||
+ "]. This limit can be set by changing the ["
|
||||
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey()
|
||||
+ "] index level setting.",
|
||||
ex.getMessage()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -52,23 +52,20 @@ public class OpenSearchSolrSynonymParserTests extends OpenSearchTokenStreamTestC
|
|||
|
||||
public void testLenientParser() throws IOException, ParseException {
|
||||
OpenSearchSolrSynonymParser parser = new OpenSearchSolrSynonymParser(true, false, true, new StandardAnalyzer());
|
||||
String rules =
|
||||
"&,and\n" +
|
||||
"come,advance,approach\n";
|
||||
String rules = "&,and\n" + "come,advance,approach\n";
|
||||
StringReader rulesReader = new StringReader(rules);
|
||||
parser.parse(rulesReader);
|
||||
SynonymMap synonymMap = parser.build();
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader("approach quietly then advance & destroy"));
|
||||
TokenStream ts = new SynonymFilter(tokenizer, synonymMap, false);
|
||||
assertTokenStreamContents(ts, new String[]{"come", "quietly", "then", "come", "destroy"});
|
||||
assertTokenStreamContents(ts, new String[] { "come", "quietly", "then", "come", "destroy" });
|
||||
}
|
||||
|
||||
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
|
||||
CharArraySet stopSet = new CharArraySet(1, true);
|
||||
stopSet.add("bar");
|
||||
OpenSearchSolrSynonymParser parser =
|
||||
new OpenSearchSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
|
||||
OpenSearchSolrSynonymParser parser = new OpenSearchSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
|
||||
String rules = "foo,bar,baz";
|
||||
StringReader rulesReader = new StringReader(rules);
|
||||
parser.parse(rulesReader);
|
||||
|
@ -76,14 +73,12 @@ public class OpenSearchSolrSynonymParserTests extends OpenSearchTokenStreamTestC
|
|||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
|
||||
TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
|
||||
assertTokenStreamContents(ts, new String[]{"first", "word", "is", "foo", "then", "and", "lastly", "foo"});
|
||||
assertTokenStreamContents(ts, new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
|
||||
}
|
||||
|
||||
public void testNonLenientParser() {
|
||||
OpenSearchSolrSynonymParser parser = new OpenSearchSolrSynonymParser(true, false, false, new StandardAnalyzer());
|
||||
String rules =
|
||||
"&,and=>and\n" +
|
||||
"come,advance,approach\n";
|
||||
String rules = "&,and=>and\n" + "come,advance,approach\n";
|
||||
StringReader rulesReader = new StringReader(rules);
|
||||
ParseException ex = expectThrows(ParseException.class, () -> parser.parse(rulesReader));
|
||||
assertThat(ex.getMessage(), containsString("Invalid synonym rule at line 1"));
|
||||
|
|
|
@ -52,47 +52,41 @@ public class OpenSearchWordnetSynonymParserTests extends OpenSearchTokenStreamTe
|
|||
|
||||
public void testLenientParser() throws IOException, ParseException {
|
||||
OpenSearchWordnetSynonymParser parser = new OpenSearchWordnetSynonymParser(true, false, true, new StandardAnalyzer());
|
||||
String rules =
|
||||
"s(100000001,1,'&',a,1,0).\n" +
|
||||
"s(100000001,2,'and',a,1,0).\n" +
|
||||
"s(100000002,1,'come',v,1,0).\n" +
|
||||
"s(100000002,2,'advance',v,1,0).\n" +
|
||||
"s(100000002,3,'approach',v,1,0).";
|
||||
String rules = "s(100000001,1,'&',a,1,0).\n"
|
||||
+ "s(100000001,2,'and',a,1,0).\n"
|
||||
+ "s(100000002,1,'come',v,1,0).\n"
|
||||
+ "s(100000002,2,'advance',v,1,0).\n"
|
||||
+ "s(100000002,3,'approach',v,1,0).";
|
||||
StringReader rulesReader = new StringReader(rules);
|
||||
parser.parse(rulesReader);
|
||||
SynonymMap synonymMap = parser.build();
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader("approach quietly then advance & destroy"));
|
||||
TokenStream ts = new SynonymFilter(tokenizer, synonymMap, false);
|
||||
assertTokenStreamContents(ts, new String[]{"come", "quietly", "then", "come", "destroy"});
|
||||
assertTokenStreamContents(ts, new String[] { "come", "quietly", "then", "come", "destroy" });
|
||||
}
|
||||
|
||||
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
|
||||
CharArraySet stopSet = new CharArraySet(1, true);
|
||||
stopSet.add("bar");
|
||||
OpenSearchWordnetSynonymParser parser =
|
||||
new OpenSearchWordnetSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
|
||||
String rules =
|
||||
"s(100000001,1,'foo',v,1,0).\n" +
|
||||
"s(100000001,2,'bar',v,1,0).\n" +
|
||||
"s(100000001,3,'baz',v,1,0).";
|
||||
OpenSearchWordnetSynonymParser parser = new OpenSearchWordnetSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
|
||||
String rules = "s(100000001,1,'foo',v,1,0).\n" + "s(100000001,2,'bar',v,1,0).\n" + "s(100000001,3,'baz',v,1,0).";
|
||||
StringReader rulesReader = new StringReader(rules);
|
||||
parser.parse(rulesReader);
|
||||
SynonymMap synonymMap = parser.build();
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
|
||||
TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
|
||||
assertTokenStreamContents(ts, new String[]{"first", "word", "is", "foo", "then", "and", "lastly", "foo"});
|
||||
assertTokenStreamContents(ts, new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
|
||||
}
|
||||
|
||||
public void testNonLenientParser() {
|
||||
OpenSearchWordnetSynonymParser parser = new OpenSearchWordnetSynonymParser(true, false, false, new StandardAnalyzer());
|
||||
String rules =
|
||||
"s(100000001,1,'&',a,1,0).\n" +
|
||||
"s(100000001,2,'and',a,1,0).\n" +
|
||||
"s(100000002,1,'come',v,1,0).\n" +
|
||||
"s(100000002,2,'advance',v,1,0).\n" +
|
||||
"s(100000002,3,'approach',v,1,0).";
|
||||
String rules = "s(100000001,1,'&',a,1,0).\n"
|
||||
+ "s(100000001,2,'and',a,1,0).\n"
|
||||
+ "s(100000002,1,'come',v,1,0).\n"
|
||||
+ "s(100000002,2,'advance',v,1,0).\n"
|
||||
+ "s(100000002,3,'approach',v,1,0).";
|
||||
StringReader rulesReader = new StringReader(rules);
|
||||
ParseException ex = expectThrows(ParseException.class, () -> parser.parse(rulesReader));
|
||||
assertThat(ex.getMessage(), containsString("Invalid synonym rule at line 1"));
|
||||
|
|
|
@ -48,73 +48,105 @@ public class PathHierarchyTokenizerFactoryTests extends OpenSearchTokenStreamTes
|
|||
public void testDefaults() throws IOException {
|
||||
final Index index = new Index("test", "_na_");
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null,
|
||||
"path-hierarchy-tokenizer", Settings.EMPTY).create();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
"path-hierarchy-tokenizer",
|
||||
Settings.EMPTY
|
||||
).create();
|
||||
tokenizer.setReader(new StringReader("/one/two/three"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"/one", "/one/two", "/one/two/three"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "/one", "/one/two", "/one/two/three" });
|
||||
}
|
||||
|
||||
public void testReverse() throws IOException {
|
||||
final Index index = new Index("test", "_na_");
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
Settings settings = newAnalysisSettingsBuilder().put("reverse", true).build();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null,
|
||||
"path-hierarchy-tokenizer", settings).create();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
"path-hierarchy-tokenizer",
|
||||
settings
|
||||
).create();
|
||||
tokenizer.setReader(new StringReader("/one/two/three"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"/one/two/three", "one/two/three", "two/three", "three"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "/one/two/three", "one/two/three", "two/three", "three" });
|
||||
}
|
||||
|
||||
public void testDelimiter() throws IOException {
|
||||
final Index index = new Index("test", "_na_");
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
Settings settings = newAnalysisSettingsBuilder().put("delimiter", "-").build();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null,
|
||||
"path-hierarchy-tokenizer", settings).create();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
"path-hierarchy-tokenizer",
|
||||
settings
|
||||
).create();
|
||||
tokenizer.setReader(new StringReader("/one/two/three"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"/one/two/three"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "/one/two/three" });
|
||||
tokenizer.setReader(new StringReader("one-two-three"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"one", "one-two", "one-two-three"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "one", "one-two", "one-two-three" });
|
||||
}
|
||||
|
||||
public void testReplace() throws IOException {
|
||||
final Index index = new Index("test", "_na_");
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
Settings settings = newAnalysisSettingsBuilder().put("replacement", "-").build();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null,
|
||||
"path-hierarchy-tokenizer", settings).create();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
"path-hierarchy-tokenizer",
|
||||
settings
|
||||
).create();
|
||||
tokenizer.setReader(new StringReader("/one/two/three"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"-one", "-one-two", "-one-two-three"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "-one", "-one-two", "-one-two-three" });
|
||||
tokenizer.setReader(new StringReader("one-two-three"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"one-two-three"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "one-two-three" });
|
||||
}
|
||||
|
||||
public void testSkip() throws IOException {
|
||||
final Index index = new Index("test", "_na_");
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
Settings settings = newAnalysisSettingsBuilder().put("skip", 2).build();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null,
|
||||
"path-hierarchy-tokenizer", settings).create();
|
||||
Tokenizer tokenizer = new PathHierarchyTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
"path-hierarchy-tokenizer",
|
||||
settings
|
||||
).create();
|
||||
tokenizer.setReader(new StringReader("/one/two/three/four/five"));
|
||||
assertTokenStreamContents(tokenizer, new String[] {"/three", "/three/four", "/three/four/five"});
|
||||
assertTokenStreamContents(tokenizer, new String[] { "/three", "/three/four", "/three/four/five" });
|
||||
}
|
||||
|
||||
public void testDelimiterExceptions() {
|
||||
final Index index = new Index("test", "_na_");
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
{
|
||||
String delimiter = RandomPicks.randomFrom(random(), new String[] {"--", ""});
|
||||
String delimiter = RandomPicks.randomFrom(random(), new String[] { "--", "" });
|
||||
Settings settings = newAnalysisSettingsBuilder().put("delimiter", delimiter).build();
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
() -> new PathHierarchyTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null,
|
||||
"path-hierarchy-tokenizer", settings).create());
|
||||
IllegalArgumentException e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> new PathHierarchyTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
"path-hierarchy-tokenizer",
|
||||
settings
|
||||
).create()
|
||||
);
|
||||
assertEquals("delimiter must be a one char value", e.getMessage());
|
||||
}
|
||||
{
|
||||
String replacement = RandomPicks.randomFrom(random(), new String[] {"--", ""});
|
||||
String replacement = RandomPicks.randomFrom(random(), new String[] { "--", "" });
|
||||
Settings settings = newAnalysisSettingsBuilder().put("replacement", replacement).build();
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
() -> new PathHierarchyTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null,
|
||||
"path-hierarchy-tokenizer", settings).create());
|
||||
IllegalArgumentException e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> new PathHierarchyTokenizerFactory(
|
||||
IndexSettingsModule.newIndexSettings(index, indexSettings),
|
||||
null,
|
||||
"path-hierarchy-tokenizer",
|
||||
settings
|
||||
).create()
|
||||
);
|
||||
assertEquals("replacement must be a one char value", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,90 +46,96 @@ import java.util.regex.Pattern;
|
|||
*/
|
||||
public class PatternAnalyzerTests extends OpenSearchTokenStreamTestCase {
|
||||
|
||||
/**
|
||||
* Test PatternAnalyzer when it is configured with a non-word pattern.
|
||||
*/
|
||||
public void testNonWordPattern() throws IOException {
|
||||
// Split on non-letter pattern, do not lowercase, no stopwords
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\W+"), false, null);
|
||||
assertAnalyzesTo(a, "The quick brown Fox,the abcd1234 (56.78) dc.",
|
||||
new String[] { "The", "quick", "brown", "Fox", "the", "abcd1234", "56", "78", "dc" });
|
||||
/**
|
||||
* Test PatternAnalyzer when it is configured with a non-word pattern.
|
||||
*/
|
||||
public void testNonWordPattern() throws IOException {
|
||||
// Split on non-letter pattern, do not lowercase, no stopwords
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\W+"), false, null);
|
||||
assertAnalyzesTo(
|
||||
a,
|
||||
"The quick brown Fox,the abcd1234 (56.78) dc.",
|
||||
new String[] { "The", "quick", "brown", "Fox", "the", "abcd1234", "56", "78", "dc" }
|
||||
);
|
||||
|
||||
// split on non-letter pattern, lowercase, english stopwords
|
||||
PatternAnalyzer b = new PatternAnalyzer(Pattern.compile("\\W+"), true,
|
||||
EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
assertAnalyzesTo(b, "The quick brown Fox,the abcd1234 (56.78) dc.",
|
||||
new String[] { "quick", "brown", "fox", "abcd1234", "56", "78", "dc" });
|
||||
}
|
||||
// split on non-letter pattern, lowercase, english stopwords
|
||||
PatternAnalyzer b = new PatternAnalyzer(Pattern.compile("\\W+"), true, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
assertAnalyzesTo(
|
||||
b,
|
||||
"The quick brown Fox,the abcd1234 (56.78) dc.",
|
||||
new String[] { "quick", "brown", "fox", "abcd1234", "56", "78", "dc" }
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test PatternAnalyzer when it is configured with a whitespace pattern.
|
||||
* Behavior can be similar to WhitespaceAnalyzer (depending upon options)
|
||||
*/
|
||||
public void testWhitespacePattern() throws IOException {
|
||||
// Split on whitespace patterns, do not lowercase, no stopwords
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\s+"), false, null);
|
||||
assertAnalyzesTo(a, "The quick brown Fox,the abcd1234 (56.78) dc.",
|
||||
new String[] { "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });
|
||||
/**
|
||||
* Test PatternAnalyzer when it is configured with a whitespace pattern.
|
||||
* Behavior can be similar to WhitespaceAnalyzer (depending upon options)
|
||||
*/
|
||||
public void testWhitespacePattern() throws IOException {
|
||||
// Split on whitespace patterns, do not lowercase, no stopwords
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\s+"), false, null);
|
||||
assertAnalyzesTo(
|
||||
a,
|
||||
"The quick brown Fox,the abcd1234 (56.78) dc.",
|
||||
new String[] { "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." }
|
||||
);
|
||||
|
||||
// Split on whitespace patterns, lowercase, english stopwords
|
||||
PatternAnalyzer b = new PatternAnalyzer(Pattern.compile("\\s+"), true,
|
||||
EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
assertAnalyzesTo(b, "The quick brown Fox,the abcd1234 (56.78) dc.",
|
||||
new String[] { "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
|
||||
}
|
||||
// Split on whitespace patterns, lowercase, english stopwords
|
||||
PatternAnalyzer b = new PatternAnalyzer(Pattern.compile("\\s+"), true, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
assertAnalyzesTo(
|
||||
b,
|
||||
"The quick brown Fox,the abcd1234 (56.78) dc.",
|
||||
new String[] { "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." }
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test PatternAnalyzer when it is configured with a custom pattern. In this
|
||||
* case, text is tokenized on the comma ","
|
||||
*/
|
||||
public void testCustomPattern() throws IOException {
|
||||
// Split on comma, do not lowercase, no stopwords
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile(","), false, null);
|
||||
assertAnalyzesTo(a, "Here,Are,some,Comma,separated,words,",
|
||||
new String[] { "Here", "Are", "some", "Comma", "separated", "words" });
|
||||
/**
|
||||
* Test PatternAnalyzer when it is configured with a custom pattern. In this
|
||||
* case, text is tokenized on the comma ","
|
||||
*/
|
||||
public void testCustomPattern() throws IOException {
|
||||
// Split on comma, do not lowercase, no stopwords
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile(","), false, null);
|
||||
assertAnalyzesTo(a, "Here,Are,some,Comma,separated,words,", new String[] { "Here", "Are", "some", "Comma", "separated", "words" });
|
||||
|
||||
// split on comma, lowercase, english stopwords
|
||||
PatternAnalyzer b = new PatternAnalyzer(Pattern.compile(","), true,
|
||||
EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
assertAnalyzesTo(b, "Here,Are,some,Comma,separated,words,",
|
||||
new String[] { "here", "some", "comma", "separated", "words" });
|
||||
}
|
||||
// split on comma, lowercase, english stopwords
|
||||
PatternAnalyzer b = new PatternAnalyzer(Pattern.compile(","), true, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
assertAnalyzesTo(b, "Here,Are,some,Comma,separated,words,", new String[] { "here", "some", "comma", "separated", "words" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test PatternAnalyzer against a large document.
|
||||
*/
|
||||
public void testHugeDocument() throws IOException {
|
||||
StringBuilder document = new StringBuilder();
|
||||
// 5000 a's
|
||||
char largeWord[] = new char[5000];
|
||||
Arrays.fill(largeWord, 'a');
|
||||
document.append(largeWord);
|
||||
/**
|
||||
* Test PatternAnalyzer against a large document.
|
||||
*/
|
||||
public void testHugeDocument() throws IOException {
|
||||
StringBuilder document = new StringBuilder();
|
||||
// 5000 a's
|
||||
char largeWord[] = new char[5000];
|
||||
Arrays.fill(largeWord, 'a');
|
||||
document.append(largeWord);
|
||||
|
||||
// a space
|
||||
document.append(' ');
|
||||
// a space
|
||||
document.append(' ');
|
||||
|
||||
// 2000 b's
|
||||
char largeWord2[] = new char[2000];
|
||||
Arrays.fill(largeWord2, 'b');
|
||||
document.append(largeWord2);
|
||||
// 2000 b's
|
||||
char largeWord2[] = new char[2000];
|
||||
Arrays.fill(largeWord2, 'b');
|
||||
document.append(largeWord2);
|
||||
|
||||
// Split on whitespace patterns, do not lowercase, no stopwords
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\s+"), false, null);
|
||||
assertAnalyzesTo(a, document.toString(),
|
||||
new String[] { new String(largeWord), new String(largeWord2) });
|
||||
}
|
||||
// Split on whitespace patterns, do not lowercase, no stopwords
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\s+"), false, null);
|
||||
assertAnalyzesTo(a, document.toString(), new String[] { new String(largeWord), new String(largeWord2) });
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new PatternAnalyzer(Pattern.compile(","), true, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new PatternAnalyzer(Pattern.compile(","), true, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
checkRandomData(random(), a, 10000 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
public void testNormalize() {
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\s+"), false, null);
|
||||
assertEquals(new BytesRef("FooBar"), a.normalize("dummy", "FooBar"));
|
||||
a = new PatternAnalyzer(Pattern.compile("\\s+"), true, null);
|
||||
assertEquals(new BytesRef("foobar"), a.normalize("dummy", "FooBar"));
|
||||
}
|
||||
public void testNormalize() {
|
||||
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\s+"), false, null);
|
||||
assertEquals(new BytesRef("FooBar"), a.normalize("dummy", "FooBar"));
|
||||
a = new PatternAnalyzer(Pattern.compile("\\s+"), true, null);
|
||||
assertEquals(new BytesRef("foobar"), a.normalize("dummy", "FooBar"));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,31 +49,35 @@ public class PatternCaptureTokenFilterTests extends OpenSearchTokenStreamTestCas
|
|||
public void testPatternCaptureTokenFilter() throws Exception {
|
||||
String json = "/org/opensearch/analysis/common/pattern_capture.json";
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.build();
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.build();
|
||||
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
IndexAnalyzers indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
NamedAnalyzer analyzer1 = indexAnalyzers.get("single");
|
||||
|
||||
assertTokenStreamContents(analyzer1.tokenStream("test", "foobarbaz"), new String[]{"foobarbaz","foobar","foo"});
|
||||
assertTokenStreamContents(analyzer1.tokenStream("test", "foobarbaz"), new String[] { "foobarbaz", "foobar", "foo" });
|
||||
|
||||
NamedAnalyzer analyzer2 = indexAnalyzers.get("multi");
|
||||
|
||||
assertTokenStreamContents(analyzer2.tokenStream("test", "abc123def"), new String[]{"abc123def","abc","123","def"});
|
||||
assertTokenStreamContents(analyzer2.tokenStream("test", "abc123def"), new String[] { "abc123def", "abc", "123", "def" });
|
||||
|
||||
NamedAnalyzer analyzer3 = indexAnalyzers.get("preserve");
|
||||
|
||||
assertTokenStreamContents(analyzer3.tokenStream("test", "foobarbaz"), new String[]{"foobar","foo"});
|
||||
assertTokenStreamContents(analyzer3.tokenStream("test", "foobarbaz"), new String[] { "foobar", "foo" });
|
||||
}
|
||||
|
||||
public void testNoPatterns() {
|
||||
try {
|
||||
new PatternCaptureGroupTokenFilterFactory(IndexSettingsModule.newIndexSettings("test", Settings.EMPTY), null,
|
||||
"pattern_capture", Settings.builder().put("pattern", "foobar").build());
|
||||
fail ("Expected IllegalArgumentException");
|
||||
new PatternCaptureGroupTokenFilterFactory(
|
||||
IndexSettingsModule.newIndexSettings("test", Settings.EMPTY),
|
||||
null,
|
||||
"pattern_capture",
|
||||
Settings.builder().put("pattern", "foobar").build()
|
||||
);
|
||||
fail("Expected IllegalArgumentException");
|
||||
} catch (IllegalArgumentException e) {
|
||||
assertThat(e.getMessage(), containsString("required setting 'patterns' is missing"));
|
||||
}
|
||||
|
|
|
@ -53,9 +53,7 @@ import java.util.Collections;
|
|||
public class PredicateTokenScriptFilterTests extends OpenSearchTokenStreamTestCase {
|
||||
|
||||
public void testSimpleFilter() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("index.analysis.filter.f.type", "predicate_token_filter")
|
||||
|
@ -74,7 +72,7 @@ public class PredicateTokenScriptFilterTests extends OpenSearchTokenStreamTestCa
|
|||
};
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()){
|
||||
ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()) {
|
||||
@Override
|
||||
public <FactoryType> FactoryType compile(Script script, ScriptContext<FactoryType> context) {
|
||||
assertEquals(context, AnalysisPredicateScript.CONTEXT);
|
||||
|
@ -85,16 +83,13 @@ public class PredicateTokenScriptFilterTests extends OpenSearchTokenStreamTestCa
|
|||
|
||||
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
|
||||
plugin.createComponents(null, null, null, null, scriptService, null, null, null, null, null, null);
|
||||
AnalysisModule module
|
||||
= new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin));
|
||||
AnalysisModule module = new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin));
|
||||
|
||||
IndexAnalyzers analyzers = module.getAnalysisRegistry().build(idxSettings);
|
||||
|
||||
try (NamedAnalyzer analyzer = analyzers.get("myAnalyzer")) {
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "Oh what a wonderful thing to be", new String[]{
|
||||
"Oh", "what", "to", "be"
|
||||
});
|
||||
assertAnalyzesTo(analyzer, "Oh what a wonderful thing to be", new String[] { "Oh", "what", "to", "be" });
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -64,11 +64,7 @@ public class RemoveDuplicatesFilterFactoryTests extends OpenSearchTokenStreamTes
|
|||
new Token("d", 1, 4, 5)
|
||||
);
|
||||
|
||||
assertTokenStreamContents(tokenFilter.create(cts), new String[]{
|
||||
"a", "b", "c", "d"
|
||||
}, new int[]{
|
||||
1, 1, 0, 1
|
||||
});
|
||||
assertTokenStreamContents(tokenFilter.create(cts), new String[] { "a", "b", "c", "d" }, new int[] { 1, 1, 0, 1 });
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -52,9 +52,7 @@ import java.util.Collections;
|
|||
public class ScriptedConditionTokenFilterTests extends OpenSearchTokenStreamTestCase {
|
||||
|
||||
public void testSimpleCondition() throws Exception {
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("index.analysis.filter.cond.type", "condition")
|
||||
|
@ -74,7 +72,7 @@ public class ScriptedConditionTokenFilterTests extends OpenSearchTokenStreamTest
|
|||
};
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()){
|
||||
ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()) {
|
||||
@Override
|
||||
public <FactoryType> FactoryType compile(Script script, ScriptContext<FactoryType> context) {
|
||||
assertEquals(context, AnalysisPredicateScript.CONTEXT);
|
||||
|
@ -85,16 +83,13 @@ public class ScriptedConditionTokenFilterTests extends OpenSearchTokenStreamTest
|
|||
|
||||
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
|
||||
plugin.createComponents(null, null, null, null, scriptService, null, null, null, null, null, null);
|
||||
AnalysisModule module
|
||||
= new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin));
|
||||
AnalysisModule module = new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin));
|
||||
|
||||
IndexAnalyzers analyzers = module.getAnalysisRegistry().build(idxSettings);
|
||||
|
||||
try (NamedAnalyzer analyzer = analyzers.get("myAnalyzer")) {
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "Vorsprung Durch Technik", new String[]{
|
||||
"Vorsprung", "Durch", "TECHNIK"
|
||||
});
|
||||
assertAnalyzesTo(analyzer, "Vorsprung Durch Technik", new String[] { "Vorsprung", "Durch", "TECHNIK" });
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -52,7 +52,8 @@ public class ShingleTokenFilterTests extends OpenSearchTokenStreamTestCase {
|
|||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader("this is a test"));
|
||||
|
|
|
@ -38,35 +38,29 @@ import org.opensearch.test.OpenSearchTokenStreamTestCase;
|
|||
|
||||
public class SnowballAnalyzerTests extends OpenSearchTokenStreamTestCase {
|
||||
|
||||
public void testEnglish() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("English");
|
||||
assertAnalyzesTo(a, "he abhorred accents",
|
||||
new String[]{"he", "abhor", "accent"});
|
||||
}
|
||||
public void testEnglish() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("English");
|
||||
assertAnalyzesTo(a, "he abhorred accents", new String[] { "he", "abhor", "accent" });
|
||||
}
|
||||
|
||||
public void testStopwords() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("English",
|
||||
EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
assertAnalyzesTo(a, "the quick brown fox jumped",
|
||||
new String[]{"quick", "brown", "fox", "jump"});
|
||||
}
|
||||
public void testStopwords() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("English", EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
assertAnalyzesTo(a, "the quick brown fox jumped", new String[] { "quick", "brown", "fox", "jump" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test turkish lowercasing
|
||||
*/
|
||||
public void testTurkish() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("Turkish");
|
||||
/**
|
||||
* Test turkish lowercasing
|
||||
*/
|
||||
public void testTurkish() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("Turkish");
|
||||
|
||||
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
|
||||
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
|
||||
}
|
||||
assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
|
||||
assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
|
||||
}
|
||||
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("English");
|
||||
assertAnalyzesTo(a, "he abhorred accents",
|
||||
new String[]{"he", "abhor", "accent"});
|
||||
assertAnalyzesTo(a, "she abhorred him",
|
||||
new String[]{"she", "abhor", "him"});
|
||||
}
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("English");
|
||||
assertAnalyzesTo(a, "he abhorred accents", new String[] { "he", "abhor", "accent" });
|
||||
assertAnalyzesTo(a, "she abhorred him", new String[] { "she", "abhor", "him" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -59,7 +59,8 @@ public class StemmerOverrideTokenFilterFactoryTests extends OpenSearchTokenStrea
|
|||
.putList("index.analysis.filter.my_stemmer_override.rules", rules)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
new CommonAnalysisPlugin()
|
||||
);
|
||||
|
||||
return analysis.tokenFilter.get("my_stemmer_override");
|
||||
}
|
||||
|
@ -75,19 +76,18 @@ public class StemmerOverrideTokenFilterFactoryTests extends OpenSearchTokenStrea
|
|||
"=>a", // no keys
|
||||
"a,=>b" // empty key
|
||||
)) {
|
||||
expectThrows(RuntimeException.class, String.format(
|
||||
Locale.ROOT, "Should fail for invalid rule: '%s'", rule
|
||||
), () -> create(rule));
|
||||
expectThrows(
|
||||
RuntimeException.class,
|
||||
String.format(Locale.ROOT, "Should fail for invalid rule: '%s'", rule),
|
||||
() -> create(rule)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
public void testRulesOk() throws IOException {
|
||||
TokenFilterFactory tokenFilterFactory = create(
|
||||
"a => 1",
|
||||
"b,c => 2"
|
||||
);
|
||||
TokenFilterFactory tokenFilterFactory = create("a => 1", "b,c => 2");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader("a b c"));
|
||||
assertTokenStreamContents(tokenFilterFactory.create(tokenizer), new String[]{"1", "2", "2"});
|
||||
assertTokenStreamContents(tokenFilterFactory.create(tokenizer), new String[] { "1", "2", "2" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -63,13 +63,13 @@ public class StemmerTokenFilterFactoryTests extends OpenSearchTokenStreamTestCas
|
|||
for (int i = 0; i < iters; i++) {
|
||||
Version v = VersionUtils.randomVersion(random());
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.my_english.type", "stemmer")
|
||||
.put("index.analysis.filter.my_english.language", "english")
|
||||
.put("index.analysis.analyzer.my_english.tokenizer","whitespace")
|
||||
.put("index.analysis.analyzer.my_english.filter","my_english")
|
||||
.put(SETTING_VERSION_CREATED,v)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
.put("index.analysis.filter.my_english.type", "stemmer")
|
||||
.put("index.analysis.filter.my_english.language", "english")
|
||||
.put("index.analysis.analyzer.my_english.tokenizer", "whitespace")
|
||||
.put("index.analysis.analyzer.my_english.filter", "my_english")
|
||||
.put(SETTING_VERSION_CREATED, v)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
|
||||
|
@ -80,7 +80,7 @@ public class StemmerTokenFilterFactoryTests extends OpenSearchTokenStreamTestCas
|
|||
IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
|
||||
NamedAnalyzer analyzer = indexAnalyzers.get("my_english");
|
||||
assertThat(create, instanceOf(PorterStemFilter.class));
|
||||
assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"});
|
||||
assertAnalyzesTo(analyzer, "consolingly", new String[] { "consolingli" });
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -90,13 +90,13 @@ public class StemmerTokenFilterFactoryTests extends OpenSearchTokenStreamTestCas
|
|||
|
||||
Version v = VersionUtils.randomVersion(random());
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.my_porter2.type", "stemmer")
|
||||
.put("index.analysis.filter.my_porter2.language", "porter2")
|
||||
.put("index.analysis.analyzer.my_porter2.tokenizer","whitespace")
|
||||
.put("index.analysis.analyzer.my_porter2.filter","my_porter2")
|
||||
.put(SETTING_VERSION_CREATED,v)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
.put("index.analysis.filter.my_porter2.type", "stemmer")
|
||||
.put("index.analysis.filter.my_porter2.language", "porter2")
|
||||
.put("index.analysis.analyzer.my_porter2.tokenizer", "whitespace")
|
||||
.put("index.analysis.analyzer.my_porter2.filter", "my_porter2")
|
||||
.put(SETTING_VERSION_CREATED, v)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
|
||||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_porter2");
|
||||
|
@ -107,18 +107,23 @@ public class StemmerTokenFilterFactoryTests extends OpenSearchTokenStreamTestCas
|
|||
IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
|
||||
NamedAnalyzer analyzer = indexAnalyzers.get("my_porter2");
|
||||
assertThat(create, instanceOf(SnowballFilter.class));
|
||||
assertAnalyzesTo(analyzer, "possibly", new String[]{"possibl"});
|
||||
assertAnalyzesTo(analyzer, "possibly", new String[] { "possibl" });
|
||||
}
|
||||
}
|
||||
|
||||
public void testMultipleLanguagesThrowsException() throws IOException {
|
||||
Version v = VersionUtils.randomVersion(random());
|
||||
Settings settings = Settings.builder().put("index.analysis.filter.my_english.type", "stemmer")
|
||||
.putList("index.analysis.filter.my_english.language", "english", "light_english").put(SETTING_VERSION_CREATED, v)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
|
||||
Settings settings = Settings.builder()
|
||||
.put("index.analysis.filter.my_english.type", "stemmer")
|
||||
.putList("index.analysis.filter.my_english.language", "english", "light_english")
|
||||
.put(SETTING_VERSION_CREATED, v)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN));
|
||||
IllegalArgumentException e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN)
|
||||
);
|
||||
assertEquals("Invalid stemmer class specified: [english, light_english]", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -80,10 +80,11 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
Files.copy(synonymsWordnet, config.resolve("synonyms_wordnet.txt"));
|
||||
|
||||
String json = "/org/opensearch/analysis/common/synonyms.json";
|
||||
Settings settings = Settings.builder().
|
||||
loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), home)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT).build();
|
||||
Settings settings = Settings.builder()
|
||||
.loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), home)
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.build();
|
||||
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
@ -110,7 +111,7 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
.put("index.analysis.filter.stop_within_synonym.type", "stop")
|
||||
.putList("index.analysis.filter.stop_within_synonym.stopwords", "foobar", "opensearch")
|
||||
.put("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.tokenizer", "whitespace")
|
||||
.putList("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.filter", "stop_within_synonym","synonym")
|
||||
.putList("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.filter", "stop_within_synonym", "synonym")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
try {
|
||||
|
@ -131,7 +132,7 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
.put("index.analysis.filter.stop_within_synonym.type", "stop")
|
||||
.putList("index.analysis.filter.stop_within_synonym.stopwords", "foobar", "opensearch")
|
||||
.put("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.tokenizer", "whitespace")
|
||||
.putList("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.filter", "stop_within_synonym","synonym_expand")
|
||||
.putList("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.filter", "stop_within_synonym", "synonym_expand")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
try {
|
||||
|
@ -159,9 +160,12 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "Some developers are odd",
|
||||
new String[]{ "some", "developers", "develop", "programm", "are", "odd" },
|
||||
new int[]{ 1, 1, 0, 0, 1, 1 });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(
|
||||
indexAnalyzers.get("synonymAnalyzer"),
|
||||
"Some developers are odd",
|
||||
new String[] { "some", "developers", "develop", "programm", "are", "odd" },
|
||||
new int[] { 1, 1, 0, 0, 1, 1 }
|
||||
);
|
||||
}
|
||||
|
||||
public void testAsciiFoldingFilterForSynonyms() throws IOException {
|
||||
|
@ -176,9 +180,12 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "høj",
|
||||
new String[]{ "hoj", "height" },
|
||||
new int[]{ 1, 0 });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(
|
||||
indexAnalyzers.get("synonymAnalyzer"),
|
||||
"høj",
|
||||
new String[] { "hoj", "height" },
|
||||
new int[] { 1, 0 }
|
||||
);
|
||||
}
|
||||
|
||||
public void testPreconfigured() throws IOException {
|
||||
|
@ -193,9 +200,12 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("my_analyzer"), "würst",
|
||||
new String[]{ "wurst", "sausage"},
|
||||
new int[]{ 1, 0 });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(
|
||||
indexAnalyzers.get("my_analyzer"),
|
||||
"würst",
|
||||
new String[] { "wurst", "sausage" },
|
||||
new int[] { 1, 0 }
|
||||
);
|
||||
}
|
||||
|
||||
public void testChainedSynonymFilters() throws IOException {
|
||||
|
@ -212,15 +222,21 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("syn"), "term1",
|
||||
new String[]{ "term1", "term3", "term2" }, new int[]{ 1, 0, 0 });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(
|
||||
indexAnalyzers.get("syn"),
|
||||
"term1",
|
||||
new String[] { "term1", "term3", "term2" },
|
||||
new int[] { 1, 0, 0 }
|
||||
);
|
||||
}
|
||||
|
||||
public void testShingleFilters() {
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT))
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT)
|
||||
)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.put("index.analysis.filter.synonyms.type", "synonym")
|
||||
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
|
||||
|
@ -230,9 +246,10 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
|
||||
expectThrows(IllegalArgumentException.class, () -> {
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
});
|
||||
expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> { indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; }
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
@ -246,9 +263,7 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
|
||||
String[] bypassingFactories = new String[]{
|
||||
"dictionary_decompounder"
|
||||
};
|
||||
String[] bypassingFactories = new String[] { "dictionary_decompounder" };
|
||||
|
||||
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
|
||||
for (String factory : bypassingFactories) {
|
||||
|
@ -265,14 +280,25 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
}
|
||||
|
||||
public void testPreconfiguredTokenFilters() throws IOException {
|
||||
Set<String> disallowedFilters = new HashSet<>(Arrays.asList(
|
||||
"common_grams", "edge_ngram", "edgeNGram", "keyword_repeat", "ngram", "nGram",
|
||||
"shingle", "word_delimiter", "word_delimiter_graph"
|
||||
));
|
||||
Set<String> disallowedFilters = new HashSet<>(
|
||||
Arrays.asList(
|
||||
"common_grams",
|
||||
"edge_ngram",
|
||||
"edgeNGram",
|
||||
"keyword_repeat",
|
||||
"ngram",
|
||||
"nGram",
|
||||
"shingle",
|
||||
"word_delimiter",
|
||||
"word_delimiter_graph"
|
||||
)
|
||||
);
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT))
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT)
|
||||
)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
|
@ -281,23 +307,26 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
|
||||
for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) {
|
||||
if (disallowedFilters.contains(tf.getName())) {
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
"Expected exception for factory " + tf.getName(), () -> {
|
||||
tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
|
||||
});
|
||||
assertEquals(tf.getName(), "Token filter [" + tf.getName()
|
||||
+ "] cannot be used to parse synonyms",
|
||||
e.getMessage());
|
||||
}
|
||||
else {
|
||||
IllegalArgumentException e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
"Expected exception for factory " + tf.getName(),
|
||||
() -> { tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter(); }
|
||||
);
|
||||
assertEquals(tf.getName(), "Token filter [" + tf.getName() + "] cannot be used to parse synonyms", e.getMessage());
|
||||
} else {
|
||||
tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
|
||||
}
|
||||
}
|
||||
|
||||
Settings settings2 = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(
|
||||
random(), LegacyESVersion.V_6_0_0, VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)))
|
||||
random(),
|
||||
LegacyESVersion.V_6_0_0,
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)
|
||||
)
|
||||
)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.putList("common_words", "a", "b")
|
||||
.put("output_unigrams", "true")
|
||||
|
@ -309,8 +338,7 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
if (disallowedFilters.contains(tf.getName())) {
|
||||
tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter();
|
||||
expectedWarnings.add("Token filter [" + tf.getName() + "] will not be usable to parse synonyms after v7.0");
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter();
|
||||
}
|
||||
}
|
||||
|
@ -320,8 +348,10 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
public void testDisallowedTokenFilters() throws IOException {
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT))
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT)
|
||||
)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.putList("common_words", "a", "b")
|
||||
.put("output_unigrams", "true")
|
||||
|
@ -329,29 +359,39 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
|
||||
|
||||
String[] disallowedFactories = new String[]{
|
||||
"multiplexer", "cjk_bigram", "common_grams", "ngram", "edge_ngram",
|
||||
"word_delimiter", "word_delimiter_graph", "fingerprint"
|
||||
};
|
||||
String[] disallowedFactories = new String[] {
|
||||
"multiplexer",
|
||||
"cjk_bigram",
|
||||
"common_grams",
|
||||
"ngram",
|
||||
"edge_ngram",
|
||||
"word_delimiter",
|
||||
"word_delimiter_graph",
|
||||
"fingerprint" };
|
||||
|
||||
for (String factory : disallowedFactories) {
|
||||
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
|
||||
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
|
||||
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
|
||||
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
IllegalArgumentException e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
"Expected IllegalArgumentException for factory " + factory,
|
||||
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null));
|
||||
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)
|
||||
);
|
||||
|
||||
assertEquals(factory, "Token filter [" + factory
|
||||
+ "] cannot be used to parse synonyms",
|
||||
e.getMessage());
|
||||
assertEquals(factory, "Token filter [" + factory + "] cannot be used to parse synonyms", e.getMessage());
|
||||
}
|
||||
|
||||
settings = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(
|
||||
random(), LegacyESVersion.V_6_0_0, VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)))
|
||||
random(),
|
||||
LegacyESVersion.V_6_0_0,
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)
|
||||
)
|
||||
)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.putList("common_words", "a", "b")
|
||||
.put("output_unigrams", "true")
|
||||
|
@ -365,16 +405,20 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
|
||||
|
||||
stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
|
||||
expectedWarnings.add("Token filter [" + factory
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
expectedWarnings.add("Token filter [" + factory + "] will not be usable to parse synonyms after v7.0");
|
||||
}
|
||||
|
||||
assertWarnings(expectedWarnings.toArray(new String[0]));
|
||||
|
||||
settings = Settings.builder()
|
||||
.put(IndexMetadata.SETTING_VERSION_CREATED,
|
||||
.put(
|
||||
IndexMetadata.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(
|
||||
random(), LegacyESVersion.V_6_0_0, VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)))
|
||||
random(),
|
||||
LegacyESVersion.V_6_0_0,
|
||||
VersionUtils.getPreviousVersion(LegacyESVersion.V_7_0_0)
|
||||
)
|
||||
)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.put("preserve_original", "false")
|
||||
.build();
|
||||
|
@ -383,11 +427,12 @@ public class SynonymsAnalysisTests extends OpenSearchTestCase {
|
|||
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
|
||||
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
|
||||
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null));
|
||||
IllegalArgumentException e = expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)
|
||||
);
|
||||
|
||||
assertEquals("Token filter [multiplexer] cannot be used to parse synonyms unless [preserve_original] is [true]",
|
||||
e.getMessage());
|
||||
assertEquals("Token filter [multiplexer] cannot be used to parse synonyms unless [preserve_original] is [true]", e.getMessage());
|
||||
|
||||
}
|
||||
|
||||
|
|