migrate branch for analysis-icu

This commit is contained in:
commit f3228e394d

@@ -0,0 +1,290 @@
ICU Analysis for Elasticsearch
==================================

The ICU Analysis plugin integrates the Lucene ICU module into Elasticsearch, adding ICU-related analysis components.

In order to install the plugin, simply run:

```sh
bin/plugin install elasticsearch/elasticsearch-analysis-icu/2.5.0
```

You need to install a version matching your Elasticsearch version:

| elasticsearch | ICU Analysis Plugin   | Docs       |
|---------------|-----------------------|------------|
| master        | Build from source     | See below  |
| es-1.x        | Build from source     | [2.6.0-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-icu/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) |
| es-1.5        | 2.5.0                 | [2.5.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.5.0/#version-250-for-elasticsearch-15) |
| es-1.4        | 2.4.3                 | [2.4.3](https://github.com/elasticsearch/elasticsearch-analysis-icu/tree/v2.4.3/#version-243-for-elasticsearch-14) |
| < 1.4.5       | 2.4.2                 | [2.4.2](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.4.2/#version-242-for-elasticsearch-14) |
| < 1.4.3       | 2.4.1                 | [2.4.1](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.4.1/#version-241-for-elasticsearch-14) |
| es-1.3        | 2.3.0                 | [2.3.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.3.0/#icu-analysis-for-elasticsearch) |
| es-1.2        | 2.2.0                 | [2.2.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.2.0/#icu-analysis-for-elasticsearch) |
| es-1.1        | 2.1.0                 | [2.1.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.1.0/#icu-analysis-for-elasticsearch) |
| es-1.0        | 2.0.0                 | [2.0.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v2.0.0/#icu-analysis-for-elasticsearch) |
| es-0.90       | 1.13.0                | [1.13.0](https://github.com/elastic/elasticsearch-analysis-icu/tree/v1.13.0/#icu-analysis-for-elasticsearch) |

To build a `SNAPSHOT` version, you need to build it with Maven:

```bash
mvn clean install
plugin --install analysis-icu \
       --url file:target/releases/elasticsearch-analysis-icu-X.X.X-SNAPSHOT.zip
```
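
After restarting the node, a quick sanity check is to ask the nodes info API which plugins were loaded (a minimal sketch, assuming a local node on the default port; the `plugins` metric is part of the 1.x nodes info API):

```sh
# sketch: list the plugins loaded by a local node (assumes default port 9200)
curl 'localhost:9200/_nodes/plugins?pretty'
```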


ICU Normalization
-----------------

Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization). It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings. The `name` parameter can be provided and accepts the following values: `nfc`, `nfkc`, and `nfkc_cf`. Here are sample settings:

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "normalized" : {
                    "tokenizer" : "keyword",
                    "filter" : ["icu_normalizer"]
                }
            }
        }
    }
}
```
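
A quick way to see the effect is the `_analyze` API (a minimal sketch; the index name `test` is hypothetical and must have been created with the settings above):

```sh
# sketch: run the `normalized` analyzer on a string (index name `test` is hypothetical)
curl -XGET 'localhost:9200/test/_analyze?analyzer=normalized&pretty' -d 'Ⅳ'
# the default nfkc_cf normalization folds the Roman numeral Ⅳ (U+2163) to plain "iv"
```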

ICU Folding
-----------

Folding of unicode characters based on `UTR#30`. It registers itself under the `icu_folding` and `icuFolding` names. Sample setting:

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "folded" : {
                    "tokenizer" : "keyword",
                    "filter" : ["icu_folding"]
                }
            }
        }
    }
}
```

ICU Filtering
-------------

The folding can be filtered by a set of unicode characters with the parameter `unicodeSetFilter`. This is useful for a
non-internationalized search engine where you want to retain a set of national characters that are primary letters in a specific
language. See the syntax for the UnicodeSet [here](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html).

The following example exempts Swedish characters from the folding. Note that the filtered characters are NOT lowercased, which is why we add the `lowercase` filter below.

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "folding" : {
                    "tokenizer" : "standard",
                    "filter" : ["my_icu_folding", "lowercase"]
                }
            },
            "filter" : {
                "my_icu_folding" : {
                    "type" : "icu_folding",
                    "unicodeSetFilter" : "[^åäöÅÄÖ]"
                }
            }
        }
    }
}
```

ICU Collation
-------------

Uses the collation token filter. Allows either specifying the rules for collation
(defined [here](http://www.icu-project.org/userguide/Collate_Customization.html)) using the `rules` parameter
(the rules can be embedded in the settings or point to a file location, which can be relative to the config location), or using the
`language` parameter (further specialized by country and variant). By default it registers under `icu_collation` or
`icuCollation` and uses the default locale.

Here are sample settings:

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "collation" : {
                    "tokenizer" : "keyword",
                    "filter" : ["icu_collation"]
                }
            }
        }
    }
}
```

And here is a sample of custom collation:

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "collation" : {
                    "tokenizer" : "keyword",
                    "filter" : ["myCollator"]
                }
            },
            "filter" : {
                "myCollator" : {
                    "type" : "icu_collation",
                    "language" : "en"
                }
            }
        }
    }
}
```

Optional options:
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
 The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
 Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
 See the [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed
 explanation of the specific values.
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property to
 `canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
 normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
 before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
 faster and more complete collation behavior. Since a great many of the world's languages do not require text
 normalization, most locales set `no` as the default decomposition mode.

Expert options:
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
 to be either shifted or non-ignorable, which boils down to ignoring punctuation and whitespace.
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
 strength is set to `primary` this will ignore accent differences.
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
 for strength `tertiary`.
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to their numeric representation. For
 example, the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
* `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Distinguishes between Katakana
 and Hiragana characters in `quaternary` strength.
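
The optional and expert options are set on a custom filter alongside `language` or `rules`. Here is a minimal sketch combining a few of them (the filter name `myCollator` follows the earlier example; the chosen values are illustrative):

```js
{
    "index" : {
        "analysis" : {
            "filter" : {
                "myCollator" : {
                    "type" : "icu_collation",
                    "language" : "en",
                    "strength" : "primary",
                    "decomposition" : "canonical",
                    "numeric" : true
                }
            }
        }
    }
}
```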

ICU Tokenizer
-------------

Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://www.unicode.org/reports/tr29/).

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "tokenized" : {
                    "tokenizer" : "icu_tokenizer"
                }
            }
        }
    }
}
```
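
To inspect the token boundaries it produces, the `_analyze` API can be used again (a minimal sketch; the index name `test` is hypothetical). Thai is a convenient test case because it is written without spaces between words:

```sh
# sketch: tokenize Thai text, which has no spaces between words (index name `test` is hypothetical)
curl -XGET 'localhost:9200/test/_analyze?analyzer=tokenized&pretty' -d 'สวัสดีครับ'
# ICU uses dictionary-based word breaking for Thai, so this yields the tokens สวัสดี and ครับ
```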


ICU Normalization CharFilter
----------------------------

Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization).
It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings.
The `name` parameter can be provided and accepts the following values: `nfc`, `nfkc`, and `nfkc_cf`.
The `mode` parameter can be provided and accepts the following values: `compose` and `decompose`.
Use `decompose` with `nfc` or `nfkc` to get `nfd` or `nfkd`, respectively.
Here are sample settings:

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "normalized" : {
                    "tokenizer" : "keyword",
                    "char_filter" : ["icu_normalizer"]
                }
            }
        }
    }
}
```
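
And a further sketch with explicit `name` and `mode` values; combining `nfc` with `decompose` yields NFD normalization (the names `nfd_normalizer` and `decomposed` are illustrative):

```js
{
    "index" : {
        "analysis" : {
            "char_filter" : {
                "nfd_normalizer" : {
                    "type" : "icu_normalizer",
                    "name" : "nfc",
                    "mode" : "decompose"
                }
            },
            "analyzer" : {
                "decomposed" : {
                    "tokenizer" : "keyword",
                    "char_filter" : ["nfd_normalizer"]
                }
            }
        }
    }
}
```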

ICU Transform
-------------

Transforms are used to process Unicode text in many different ways, including case mapping, normalization,
transliteration and bidirectional text handling.

You can define the transliterator identifier with the `id` property, and specify the direction as `forward` or `reverse` with
the `dir` property. The default values of the two properties are `Null` and `forward`, respectively.

For example:

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "latin" : {
                    "tokenizer" : "keyword",
                    "filter" : ["myLatinTransform"]
                }
            },
            "filter" : {
                "myLatinTransform" : {
                    "type" : "icu_transform",
                    "id" : "Any-Latin; NFD; [:Nonspacing Mark:] Remove; NFC"
                }
            }
        }
    }
}
```

This transform transliterates characters to Latin, separates accents from their base characters, removes the accents,
and then puts the remaining text into an unaccented form.

The results are:

`你好` to `ni hao`

`здравствуйте` to `zdravstvujte`

`こんにちは` to `kon'nichiha`

Currently the filter only supports the identifier and direction; custom rulesets are not yet supported.
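
Setting `dir` to `reverse` runs the transliterator in the opposite direction where the transform supports it. A minimal sketch (the filter name `myGreekTransform` is illustrative; the `Latin-Greek` transliterator run in reverse maps Greek text to Latin):

```js
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "greek_to_latin" : {
                    "tokenizer" : "keyword",
                    "filter" : ["myGreekTransform"]
                }
            },
            "filter" : {
                "myGreekTransform" : {
                    "type" : "icu_transform",
                    "id" : "Latin-Greek",
                    "dir" : "reverse"
                }
            }
        }
    }
}
```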

For more documentation, please see the [user guide of ICU Transform](http://userguide.icu-project.org/transforms/general).

License
-------

This software is licensed under the Apache 2 license, quoted below.

Copyright 2009-2014 Elasticsearch <http://www.elasticsearch.org>

Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except in compliance with the License. You may obtain a copy of
the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations under
the License.

@@ -0,0 +1,57 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.elasticsearch.plugin</groupId>
    <artifactId>elasticsearch-analysis-icu</artifactId>

    <packaging>jar</packaging>
    <name>Elasticsearch ICU Analysis plugin</name>
    <description>The ICU Analysis plugin integrates the Lucene ICU module into elasticsearch, adding ICU-related analysis components.</description>

    <parent>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-plugin</artifactId>
        <version>2.0.0-SNAPSHOT</version>
    </parent>

    <properties>
        <tests.jvms>1</tests.jvms>
        <es.logger.level>INFO</es.logger.level>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-icu</artifactId>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
            </plugin>
        </plugins>

        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>com.mycila</groupId>
                    <artifactId>license-maven-plugin</artifactId>
                    <configuration>
                        <excludes>
                            <!-- TODO: https://github.com/elastic/elasticsearch-analysis-icu/issues/29 -->
                            <exclude>**/IndexableBinaryStringTools.java</exclude>
                            <exclude>**/ICUCollationKeyFilter.java</exclude>
                            <exclude>**/TestIndexableBinaryStringTools.java</exclude>
                        </excludes>
                    </configuration>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>

</project>

@@ -0,0 +1,26 @@
<?xml version="1.0"?>
<assembly>
    <id>plugin</id>
    <formats>
        <format>zip</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <dependencySets>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
            </excludes>
        </dependencySet>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <includes>
                <include>org.apache.lucene:lucene-analyzers-icu</include>
            </includes>
        </dependencySet>
    </dependencySets>
</assembly>

@@ -0,0 +1,109 @@
package org.elasticsearch.index.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;

/**
 * <p>
 * Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
 * then encodes the CollationKey with {@link IndexableBinaryStringTools}, to
 * allow it to be stored as an index term.
 * </p>
 * <p>
 * <strong>WARNING:</strong> Make sure you use exactly the same Collator at
 * index and query time -- CollationKeys are only comparable when produced by
 * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
 * independently versioned, so it is safe to search against stored
 * CollationKeys if the following are exactly the same (best practice is
 * to store this information with the index and check that they remain the
 * same at query time):
 * </p>
 * <ol>
 * <li>
 * Collator version - see {@link Collator#getVersion()}
 * </li>
 * <li>
 * The collation strength used - see {@link Collator#setStrength(int)}
 * </li>
 * </ol>
 * <p>
 * CollationKeys generated by ICU Collators are not compatible with those
 * generated by java.text.Collators. Specifically, if you use
 * ICUCollationKeyFilter to generate index terms, do not use
 * {@code CollationKeyFilter} on the query side, or vice versa.
 * </p>
 * <p>
 * ICUCollationKeyFilter is significantly faster and generates significantly
 * shorter keys than CollationKeyFilter. See
 * <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
 * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
 * generation timing and key length comparisons between ICU4J and
 * java.text.Collator over several languages.
 * </p>
 * @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
 * terms directly as bytes. This filter WAS removed in Lucene 5.0
 */
@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
    private Collator collator = null;
    private RawCollationKey reusableKey = new RawCollationKey();
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    /**
     * @param input    Source token stream
     * @param collator CollationKey generator
     */
    public ICUCollationKeyFilter(TokenStream input, Collator collator) {
        super(input);
        // clone the collator: see http://userguide.icu-project.org/collation/architecture
        try {
            this.collator = (Collator) collator.clone();
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (input.incrementToken()) {
            char[] termBuffer = termAtt.buffer();
            String termText = new String(termBuffer, 0, termAtt.length());
            collator.getRawCollationKey(termText, reusableKey);
            int encodedLength = IndexableBinaryStringTools.getEncodedLength(
                    reusableKey.bytes, 0, reusableKey.size);
            if (encodedLength > termBuffer.length) {
                termAtt.resizeBuffer(encodedLength);
            }
            termAtt.setLength(encodedLength);
            IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size,
                    termAtt.buffer(), 0, encodedLength);
            return true;
        } else {
            return false;
        }
    }
}

@@ -0,0 +1,43 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

/**
 */
public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {

    @Override
    public void processCharFilters(CharFiltersBindings charFiltersBindings) {
        charFiltersBindings.processCharFilter("icu_normalizer", IcuNormalizerCharFilterFactory.class);
    }

    @Override
    public void processTokenizers(TokenizersBindings tokenizersBindings) {
        tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class);
    }

    @Override
    public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
        tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("icu_folding", IcuFoldingTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("icu_collation", IcuCollationTokenFilterFactory.class);
        tokenFiltersBindings.processTokenFilter("icu_transform", IcuTransformTokenFilterFactory.class);
    }
}

@@ -0,0 +1,178 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.Streams;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.FailedToResolveConfigException;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;

/**
 * An ICU based collation token filter. There are two ways to configure collation:
 * <p/>
 * <p>The first is simply specifying the locale (defaults to the default locale). The <tt>language</tt>
 * parameter is the lowercase two-letter ISO-639 code. An additional <tt>country</tt> and <tt>variant</tt>
 * can be provided.
 * <p/>
 * <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
 * Collation customization</a> chapter in the ICU docs. The <tt>rules</tt> parameter can either embed the rules definition
 * in the settings or refer to an external location (preferably located under the <tt>config</tt> location and relative to it).
 */
public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {

    private final Collator collator;

    @Inject
    public IcuCollationTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment environment, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);

        Collator collator;
        String rules = settings.get("rules");
        if (rules != null) {
            Exception failureToResolve = null;
            try {
                rules = Streams.copyToString(Files.newBufferedReader(environment.configFile().resolve(rules), Charset.forName("UTF-8")));
            } catch (FailedToResolveConfigException | IOException | SecurityException e) {
                failureToResolve = e;
            }
            try {
                collator = new RuleBasedCollator(rules);
            } catch (Exception e) {
                if (failureToResolve != null) {
                    throw new IllegalArgumentException("Failed to resolve collation rules location", failureToResolve);
                } else {
                    throw new IllegalArgumentException("Failed to parse collation rules", e);
                }
            }
        } else {
            String language = settings.get("language");
            if (language != null) {
                ULocale locale;
                String country = settings.get("country");
                if (country != null) {
                    String variant = settings.get("variant");
                    if (variant != null) {
                        locale = new ULocale(language, country, variant);
                    } else {
                        locale = new ULocale(language, country);
                    }
                } else {
                    locale = new ULocale(language);
                }
                collator = Collator.getInstance(locale);
            } else {
                collator = Collator.getInstance();
            }
        }

        // set the strength flag, otherwise it will be the default.
        String strength = settings.get("strength");
        if (strength != null) {
            if (strength.equalsIgnoreCase("primary")) {
                collator.setStrength(Collator.PRIMARY);
            } else if (strength.equalsIgnoreCase("secondary")) {
                collator.setStrength(Collator.SECONDARY);
            } else if (strength.equalsIgnoreCase("tertiary")) {
                collator.setStrength(Collator.TERTIARY);
            } else if (strength.equalsIgnoreCase("quaternary")) {
                collator.setStrength(Collator.QUATERNARY);
            } else if (strength.equalsIgnoreCase("identical")) {
                collator.setStrength(Collator.IDENTICAL);
            } else {
                throw new IllegalArgumentException("Invalid strength: " + strength);
            }
        }

        // set the decomposition flag, otherwise it will be the default.
        String decomposition = settings.get("decomposition");
        if (decomposition != null) {
            if (decomposition.equalsIgnoreCase("no")) {
                collator.setDecomposition(Collator.NO_DECOMPOSITION);
            } else if (decomposition.equalsIgnoreCase("canonical")) {
                collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
            } else {
                throw new IllegalArgumentException("Invalid decomposition: " + decomposition);
            }
        }

        // expert options: concrete subclasses are always a RuleBasedCollator
        RuleBasedCollator rbc = (RuleBasedCollator) collator;
        String alternate = settings.get("alternate");
        if (alternate != null) {
            if (alternate.equalsIgnoreCase("shifted")) {
                rbc.setAlternateHandlingShifted(true);
            } else if (alternate.equalsIgnoreCase("non-ignorable")) {
                rbc.setAlternateHandlingShifted(false);
            } else {
                throw new IllegalArgumentException("Invalid alternate: " + alternate);
            }
        }

        Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
        if (caseLevel != null) {
            rbc.setCaseLevel(caseLevel);
        }

        String caseFirst = settings.get("caseFirst");
        if (caseFirst != null) {
            if (caseFirst.equalsIgnoreCase("lower")) {
                rbc.setLowerCaseFirst(true);
            } else if (caseFirst.equalsIgnoreCase("upper")) {
                rbc.setUpperCaseFirst(true);
            } else {
                throw new IllegalArgumentException("Invalid caseFirst: " + caseFirst);
            }
        }

        Boolean numeric = settings.getAsBoolean("numeric", null);
        if (numeric != null) {
            rbc.setNumericCollation(numeric);
        }

        String variableTop = settings.get("variableTop");
        if (variableTop != null) {
            rbc.setVariableTop(variableTop);
        }

        Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
        if (hiraganaQuaternaryMode != null) {
            rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
        }

        this.collator = collator;
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new ICUCollationKeyFilter(tokenStream, collator);
    }
}

@@ -0,0 +1,72 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;

/**
 * Uses the {@link org.apache.lucene.analysis.icu.ICUFoldingFilter}.
 * Applies foldings from UTR#30 Character Foldings.
 * <p>
 * Can be filtered to handle certain characters in a specified way (see http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html),
 * e.g. national characters that should be retained (filter : "[^åäöÅÄÖ]").
 *
 * <p>The <tt>unicodeSetFilter</tt> attribute can be used to provide the UnicodeSet for filtering.
 *
 * @author kimchy (shay.banon)
 */
public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
    private final String unicodeSetFilter;

    @Inject
    public IcuFoldingTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.unicodeSetFilter = settings.get("unicodeSetFilter");
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // The ICUFoldingFilter is in fact implemented as an ICUNormalizer2Filter.
        // ICUFoldingFilter lacks a constructor for adding filtering, so we implement it here.
        if (unicodeSetFilter != null) {
            Normalizer2 base = Normalizer2.getInstance(
                    ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
                    "utr30", Normalizer2.Mode.COMPOSE);
            UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter);

            unicodeSet.freeze();
            Normalizer2 filtered = new FilteredNormalizer2(base, unicodeSet);
            return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered);
        } else {
            return new ICUFoldingFilter(tokenStream);
        }
    }
}

@@ -0,0 +1,63 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.io.Reader;

/**
 * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize characters.
 * <p/>
 * <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
 * <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p>
 */
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory {

    private final String name;

    private final Normalizer2 normalizer;

    @Inject
    public IcuNormalizerCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);
        this.name = settings.get("name", "nfkc_cf");
        String mode = settings.get("mode");
        if (!"compose".equals(mode) && !"decompose".equals(mode)) {
            mode = "compose";
        }
        this.normalizer = Normalizer2.getInstance(
                null, this.name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
    }

    @Override
    public Reader create(Reader reader) {
        return new ICUNormalizer2CharFilter(reader, normalizer);
    }
}

@@ -0,0 +1,52 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

/**
 * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to normalize tokens.
 * <p/>
 * <p>The <tt>name</tt> can be used to provide the type of normalization to perform.
 */
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory {

    private final String name;

    @Inject
    public IcuNormalizerTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.name = settings.get("name", "nfkc_cf");
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE));
    }
}

@@ -0,0 +1,46 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

/**
 */
public class IcuTokenizerFactory extends AbstractTokenizerFactory {

    @Inject
    public IcuTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
    }

    @Override
    public Tokenizer create() {
        return new ICUTokenizer();
    }
}

@@ -0,0 +1,53 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Transliterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUTransformFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

/**
 */
public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory {

    private final String id;
    private final int dir;
    private final Transliterator transliterator;

    @Inject
    public IcuTransformTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.id = settings.get("id", "Null");
        String s = settings.get("dir", "forward");
        this.dir = "forward".equals(s) ? Transliterator.FORWARD : Transliterator.REVERSE;
        this.transliterator = Transliterator.getInstance(id, dir);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new ICUTransformFilter(tokenStream, transliterator);
    }
}

@@ -0,0 +1,241 @@
package org.elasticsearch.index.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadoc

/**
 * Provides support for converting byte sequences to Strings and back again.
 * The resulting Strings preserve the original byte sequences' sort order.
 * <p/>
 * The Strings are constructed using a Base 8000h encoding of the original
 * binary data - each char of an encoded String represents a 15-bit chunk
 * from the byte sequence. Base 8000h was chosen because it allows for all
 * lower 15 bits of char to be used without restriction; the surrogate range
 * [U+D800-U+DFFF] does not represent valid chars, and would require
 * complicated handling to avoid them and allow use of char's high bit.
 * <p/>
 * Although unset bits are used as padding in the final char, the original
 * byte sequence could contain trailing bytes with no set bits (null bytes):
 * padding is indistinguishable from valid information. To overcome this
 * problem, a char is appended, indicating the number of encoded bytes in the
 * final content char.
 * <p/>
 *
 * @lucene.experimental
 * @deprecated Implement {@link TermToBytesRefAttribute} and store bytes directly
 * instead. This class WAS removed in Lucene 5.0
 */
@Deprecated
public final class IndexableBinaryStringTools {

    private static final CodingCase[] CODING_CASES = {
            // CodingCase(int initialShift, int finalShift)
            new CodingCase(7, 1),
            // CodingCase(int initialShift, int middleShift, int finalShift)
            new CodingCase(14, 6, 2),
            new CodingCase(13, 5, 3),
            new CodingCase(12, 4, 4),
            new CodingCase(11, 3, 5),
            new CodingCase(10, 2, 6),
            new CodingCase(9, 1, 7),
            new CodingCase(8, 0)
    };

    // Export only static methods
    private IndexableBinaryStringTools() {}

    /**
     * Returns the number of chars required to encode the given bytes.
     *
     * @param inputArray  byte sequence to be encoded
     * @param inputOffset initial offset into inputArray
     * @param inputLength number of bytes in inputArray
     * @return The number of chars required to encode the number of bytes.
     */
    public static int getEncodedLength(byte[] inputArray, int inputOffset,
                                       int inputLength) {
        // Use long for intermediaries to protect against overflow
        return (int) ((8L * inputLength + 14L) / 15L) + 1;
    }

    /**
     * Returns the number of bytes required to decode the given char sequence.
     *
     * @param encoded char sequence to be decoded
     * @param offset  initial offset
     * @param length  number of characters
     * @return The number of bytes required to decode the given char sequence
     */
    public static int getDecodedLength(char[] encoded, int offset, int length) {
        final int numChars = length - 1;
        if (numChars <= 0) {
            return 0;
        } else {
            // Use long for intermediaries to protect against overflow
            final long numFullBytesInFinalChar = encoded[offset + length - 1];
            final long numEncodedChars = numChars - 1;
            return (int) ((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar);
        }
    }

    /**
     * Encodes the input byte sequence into the output char sequence. Before
     * calling this method, ensure that the output array has sufficient
     * capacity by calling {@link #getEncodedLength(byte[], int, int)}.
     *
     * @param inputArray   byte sequence to be encoded
     * @param inputOffset  initial offset into inputArray
     * @param inputLength  number of bytes in inputArray
     * @param outputArray  char sequence to store encoded result
     * @param outputOffset initial offset into outputArray
     * @param outputLength length of output, must be getEncodedLength
     */
    public static void encode(byte[] inputArray, int inputOffset,
                              int inputLength, char[] outputArray, int outputOffset, int outputLength) {
        assert (outputLength == getEncodedLength(inputArray, inputOffset,
                inputLength));
        if (inputLength > 0) {
            int inputByteNum = inputOffset;
            int caseNum = 0;
            int outputCharNum = outputOffset;
            CodingCase codingCase;
            for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength; ++outputCharNum) {
                codingCase = CODING_CASES[caseNum];
                if (2 == codingCase.numBytes) {
                    outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
                            + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
                } else { // numBytes is 3
                    outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
                            + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)
                            + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF);
                }
                inputByteNum += codingCase.advanceBytes;
                if (++caseNum == CODING_CASES.length) {
                    caseNum = 0;
                }
            }
            // Produce final char (if any) and trailing count chars.
            codingCase = CODING_CASES[caseNum];

            if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3
                outputArray[outputCharNum++] = (char) ((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF);
                // Add trailing char containing the number of full bytes in final char
                outputArray[outputCharNum++] = (char) 1;
            } else if (inputByteNum < inputLength) {
                outputArray[outputCharNum++] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF);
                // Add trailing char containing the number of full bytes in final char
                outputArray[outputCharNum++] = caseNum == 0 ? (char) 1 : (char) 0;
            } else { // No left over bits - last char is completely filled.
                // Add trailing char containing the number of full bytes in final char
                outputArray[outputCharNum++] = (char) 1;
            }
        }
    }

    /**
     * Decodes the input char sequence into the output byte sequence. Before
     * calling this method, ensure that the output array has sufficient capacity
     * by calling {@link #getDecodedLength(char[], int, int)}.
     *
     * @param inputArray   char sequence to be decoded
     * @param inputOffset  initial offset into inputArray
     * @param inputLength  number of chars in inputArray
     * @param outputArray  byte sequence to store encoded result
     * @param outputOffset initial offset into outputArray
     * @param outputLength length of output, must be
     *                     getDecodedLength(inputArray, inputOffset, inputLength)
     */
    public static void decode(char[] inputArray, int inputOffset,
                              int inputLength, byte[] outputArray, int outputOffset, int outputLength) {
        assert (outputLength == getDecodedLength(inputArray, inputOffset,
                inputLength));
        final int numInputChars = inputLength - 1;
        final int numOutputBytes = outputLength;

        if (numOutputBytes > 0) {
            int caseNum = 0;
            int outputByteNum = outputOffset;
            int inputCharNum = inputOffset;
            short inputChar;
            CodingCase codingCase;
            for (; inputCharNum < numInputChars - 1; ++inputCharNum) {
                codingCase = CODING_CASES[caseNum];
                inputChar = (short) inputArray[inputCharNum];
                if (2 == codingCase.numBytes) {
                    if (0 == caseNum) {
                        outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift);
                    } else {
                        outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
                    }
                    outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
                } else { // numBytes is 3
                    outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
                    outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
                    outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
                }
                outputByteNum += codingCase.advanceBytes;
                if (++caseNum == CODING_CASES.length) {
                    caseNum = 0;
                }
            }
            // Handle final char
            inputChar = (short) inputArray[inputCharNum];
            codingCase = CODING_CASES[caseNum];
            if (0 == caseNum) {
                outputArray[outputByteNum] = 0;
            }
            outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift);
            final int bytesLeft = numOutputBytes - outputByteNum;
            if (bytesLeft > 1) {
                if (2 == codingCase.numBytes) {
                    outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift);
                } else { // numBytes is 3
                    outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift);
                    if (bytesLeft > 2) {
                        outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
                    }
                }
            }
        }
    }

    static class CodingCase {
        int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2;
        short middleMask, finalMask;

        CodingCase(int initialShift, int middleShift, int finalShift) {
            this.numBytes = 3;
            this.initialShift = initialShift;
            this.middleShift = middleShift;
            this.finalShift = finalShift;
            this.finalMask = (short) ((short) 0xFF >>> finalShift);
            this.middleMask = (short) ((short) 0xFF << middleShift);
        }

        CodingCase(int initialShift, int finalShift) {
            this.numBytes = 2;
            this.initialShift = initialShift;
            this.finalShift = finalShift;
            this.finalMask = (short) ((short) 0xFF >>> finalShift);
            if (finalShift != 0) {
                advanceBytes = 1;
            }
        }
    }
}

@@ -0,0 +1,110 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.indices.analysis;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Transliterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.apache.lucene.analysis.icu.ICUTransformFilter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.ICUCollationKeyFilter;
import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;

/**
 * Registers indices-level analysis components so that, if not explicitly configured,
 * they will be shared among all indices.
 */
public class IcuIndicesAnalysis extends AbstractComponent {

    @Inject
    public IcuIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
        super(settings);

        indicesAnalysisService.tokenizerFactories().put("icu_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
            @Override
            public String name() {
                return "icu_tokenizer";
            }

            @Override
            public Tokenizer create() {
                return new ICUTokenizer();
            }
        }));

        indicesAnalysisService.tokenFilterFactories().put("icu_normalizer", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override
            public String name() {
                return "icu_normalizer";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            }
        }));

        indicesAnalysisService.tokenFilterFactories().put("icu_folding", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override
            public String name() {
                return "icu_folding";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ICUFoldingFilter(tokenStream);
            }
        }));

        indicesAnalysisService.tokenFilterFactories().put("icu_collation", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override
            public String name() {
                return "icu_collation";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ICUCollationKeyFilter(tokenStream, Collator.getInstance());
            }
        }));

        indicesAnalysisService.tokenFilterFactories().put("icu_transform", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
            @Override
            public String name() {
                return "icu_transform";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ICUTransformFilter(tokenStream, Transliterator.getInstance("Null", Transliterator.FORWARD));
            }
        }));
    }
}
|
|
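// Sketch (illustrative only, not part of this file) of the raw Lucene chain
// behind the pre-built "icu_tokenizer" + "icu_folding" entries above, assuming
// java.io.StringReader is also imported:
//
//   Tokenizer tok = new ICUTokenizer();
//   tok.setReader(new StringReader("Bâton enflammé"));
//   TokenStream stream = new ICUFoldingFilter(tok);
//   // consuming the stream yields the folded terms "baton" and "enflamme"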
@ -0,0 +1,32 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.indices.analysis;

import org.elasticsearch.common.inject.AbstractModule;

/**
 * Binds {@link IcuIndicesAnalysis} as an eager singleton so that the
 * pre-built ICU analysis components are registered at node startup.
 */
public class IcuIndicesAnalysisModule extends AbstractModule {

    @Override
    protected void configure() {
        bind(IcuIndicesAnalysis.class).asEagerSingleton();
    }
}
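// Wiring sketch (assumed flow; AnalysisICUPlugin in the next hunk returns this
// module from modules()): node startup effectively runs
//
//   Injector injector = new ModulesBuilder()
//       .add(new IcuIndicesAnalysisModule() /* plus the other node modules */)
//       .createInjector();
//
// and the eager-singleton binding constructs IcuIndicesAnalysis immediately,
// registering the pre-built ICU factories once per node.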
@ -0,0 +1,59 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.plugin.analysis.icu;

import org.elasticsearch.common.inject.Module;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.IcuAnalysisBinderProcessor;
import org.elasticsearch.indices.analysis.IcuIndicesAnalysisModule;
import org.elasticsearch.plugins.AbstractPlugin;

import java.util.ArrayList;
import java.util.Collection;

/**
 * Entry point for the ICU analysis plugin: contributes the indices-level
 * analysis module and the per-index analysis binder processor.
 */
public class AnalysisICUPlugin extends AbstractPlugin {

    @Override
    public String name() {
        return "analysis-icu";
    }

    @Override
    public String description() {
        return "UTF related ICU analysis support";
    }

    @Override
    public Collection<Class<? extends Module>> modules() {
        Collection<Class<? extends Module>> classes = new ArrayList<>();
        classes.add(IcuIndicesAnalysisModule.class);
        return classes;
    }

    /**
     * Automatically called with the analysis module.
     */
    public void onModule(AnalysisModule module) {
        module.addProcessor(new IcuAnalysisBinderProcessor());
    }
}
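// Discovery note: the es-plugin.properties descriptor in the next hunk points
// the plugin loader at this class via its plugin= line; version= and lucene=
// are Maven resource-filtering placeholders resolved at build time.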
@ -0,0 +1,3 @@
plugin=org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin
version=${project.version}
lucene=${lucene.version}
@ -0,0 +1,54 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;

import static org.elasticsearch.common.settings.Settings.settingsBuilder;

public class AnalysisTestUtils {

    public static AnalysisService createAnalysisService(Settings settings) {
        Index index = new Index("test");
        Settings indexSettings = settingsBuilder().put(settings)
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .build();
        Injector parentInjector = new ModulesBuilder().add(
                new SettingsModule(settings),
                new EnvironmentModule(new Environment(settings)),
                new IndicesAnalysisModule())
                .createInjector();
        Injector injector = new ModulesBuilder().add(
                new IndexSettingsModule(index, indexSettings),
                new IndexNameModule(index),
                new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
                .createChildInjector(parentInjector);

        return injector.getInstance(AnalysisService.class);
    }
}
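// Typical use from a unit test (sketch; SimpleIcuAnalysisTests below follows
// this pattern):
//
//   Settings settings = settingsBuilder().put("path.home", createTempDir()).build();
//   AnalysisService analysisService = createAnalysisService(settings);
//   TokenizerFactory tokenizer = analysisService.tokenizer("icu_tokenizer");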
@ -0,0 +1,119 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.plugins.PluginsService;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.junit.Test;

import java.io.IOException;
import java.util.concurrent.ExecutionException;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.notNullValue;

@ElasticsearchIntegrationTest.ClusterScope(scope = ElasticsearchIntegrationTest.Scope.SUITE)
public class ICUIntegrationTests extends ElasticsearchIntegrationTest {

    @Override
    protected Settings nodeSettings(int nodeOrdinal) {
        return Settings.builder()
                .put(super.nodeSettings(nodeOrdinal))
                .put("plugins." + PluginsService.LOAD_PLUGIN_FROM_CLASSPATH, true)
                .build();
    }

    @Override
    public Settings indexSettings() {
        Settings settings = Settings.builder()
                .put(super.indexSettings())
                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
                .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_collator")
                .put("index.analysis.filter.my_collator.type", "icu_collation")
                .put("index.analysis.filter.my_collator.language", "en")
                .put("index.analysis.filter.my_collator.strength", "primary")
                .build();

        return settings;
    }

    @Test
    public void testICUAnalyzer() throws ExecutionException, InterruptedException {
        createIndex("test");
        ensureGreen("test");
        AnalyzeResponse response1 = client().admin().indices()
                .prepareAnalyze("Bâton enflammé")
                .setIndex("test")
                .setAnalyzer("my_analyzer")
                .execute().get();
        AnalyzeResponse response2 = client().admin().indices()
                .prepareAnalyze("baton enflamme")
                .setIndex("test")
                .setAnalyzer("my_analyzer")
                .execute().get();

        assertThat(response1, notNullValue());
        assertThat(response2, notNullValue());
        assertThat(response1.getTokens().size(), is(response2.getTokens().size()));

        for (int i = 0; i < response2.getTokens().size(); i++) {
            assertThat(response1.getTokens().get(i).getTerm(), is(response2.getTokens().get(i).getTerm()));
        }
    }

    @Test
    public void testICUAnalyzerInMapping() throws ExecutionException, InterruptedException, IOException {
        createIndex("test");
        ensureGreen("test");
        final XContentBuilder mapping = jsonBuilder().startObject()
                .startObject("type")
                    .startObject("properties")
                        .startObject("foo")
                            .field("type", "string")
                            .field("analyzer", "my_analyzer")
                        .endObject()
                    .endObject()
                .endObject()
            .endObject();

        client().admin().indices().preparePutMapping("test").setType("type").setSource(mapping).get();

        index("test", "type", "1", "foo", "Bâton enflammé");
        refresh();

        SearchResponse response = client().prepareSearch("test").setQuery(
                QueryBuilders.matchQuery("foo", "baton enflamme")
        ).execute().actionGet();

        assertThat(response.getHits().getTotalHits(), is(1L));
    }

    @Test
    public void testPluginIsLoaded() {
        NodesInfoResponse infos = client().admin().cluster().prepareNodesInfo().setPlugins(true).execute().actionGet();
        assertThat(infos.getNodes()[0].getPlugins().getInfos().get(0).getName(), is("analysis-icu"));
    }
}
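// The programmatic indexSettings() above are equivalent to this index-creation
// JSON (sketch of the request body):
//
//   "analysis": {
//     "analyzer": { "my_analyzer": { "tokenizer": "standard",
//                                    "filter": ["standard", "lowercase", "my_collator"] } },
//     "filter": { "my_collator": { "type": "icu_collation",
//                                  "language": "en", "strength": "primary" } }
//   }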
@ -0,0 +1,58 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;

import static org.elasticsearch.common.settings.Settings.settingsBuilder;
import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
import static org.hamcrest.Matchers.instanceOf;

/**
 * Checks that the default ICU analysis components resolve to the expected factories.
 */
public class SimpleIcuAnalysisTests extends ElasticsearchTestCase {

    @Test
    public void testDefaultsIcuAnalysis() {
        Settings settings = settingsBuilder()
                .put("path.home", createTempDir())
                .loadFromClasspath("org/elasticsearch/index/analysis/phonetic-1.yml").build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
        assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));

        TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
        assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));

        filterFactory = analysisService.tokenFilter("icu_folding");
        assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));

        filterFactory = analysisService.tokenFilter("icu_collation");
        assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));

        filterFactory = analysisService.tokenFilter("icu_transform");
        assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));

        CharFilterFactory charFilterFactory = analysisService.charFilter("icu_normalizer");
        assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));
    }
}
@ -0,0 +1,263 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;

import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;
import static org.hamcrest.Matchers.equalTo;

// Tests borrowed from Solr's ICU collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests extends ElasticsearchTestCase {

    /*
     * Turkish has some funny casing.
     * This test shows how you can solve this kind of thing easily with collation.
     * Instead of using LowerCaseFilter, use a Turkish collator with primary strength.
     * Then things will sort and match correctly.
     */
    @Test
    public void testBasicUsage() throws Exception {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "I WİLL USE TURKİSH CASING", "ı will use turkish casıng");
    }

    /*
     * Test usage of the decomposition option for Unicode normalization.
     */
    @Test
    public void testNormalization() throws IOException {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.decomposition", "canonical")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng");
    }

    /*
     * Test secondary strength; for English, case is not significant at this level.
     */
    @Test
    public void testSecondaryStrength() throws IOException {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "secondary")
                .put("index.analysis.filter.myCollator.decomposition", "no")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "TESTING", "testing");
    }

    /*
     * Setting alternate=shifted to shift whitespace, punctuation and symbols
     * to the quaternary level.
     */
    @Test
    public void testIgnorePunctuation() throws IOException {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "foo-bar", "foo bar");
    }

    /*
     * Setting alternate=shifted and variableTop to shift whitespace, but not
     * punctuation or symbols, to the quaternary level.
     */
    @Test
    public void testIgnoreWhitespace() throws IOException {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .put("index.analysis.filter.myCollator.variableTop", " ")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "foo bar", "foobar");
        // now assert that punctuation still matters: foo-bar < foo bar
        assertCollation(filterFactory, "foo-bar", "foo bar", -1);
    }

    /*
     * Setting numeric=true to encode digits by numeric value, so that
     * foobar-9 sorts before foobar-10.
     */
    @Test
    public void testNumerics() throws IOException {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.numeric", "true")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollation(filterFactory, "foobar-9", "foobar-10", -1);
    }

    /*
     * Setting caseLevel=true to create an additional case level between
     * secondary and tertiary.
     */
    @Test
    public void testIgnoreAccentsButNotCase() throws IOException {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.caseLevel", "true")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "résumé", "resume");
        assertCollatesToSame(filterFactory, "Résumé", "Resume");
        // now assert that case still matters: resume < Resume
        assertCollation(filterFactory, "resume", "Resume", -1);
    }

    /*
     * Setting caseFirst=upper to cause uppercase strings to sort
     * before lowercase ones.
     */
    @Test
    public void testUpperCaseFirst() throws IOException {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "tertiary")
                .put("index.analysis.filter.myCollator.caseFirst", "upper")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollation(filterFactory, "Resume", "resume", -1);
    }

    /*
     * For German, you might want oe to sort and match with o umlaut.
     * This is not the default, but you can make a customized ruleset to do this.
     *
     * The default is DIN 5007-1; this shows how to tailor a collator to get DIN 5007-2 behavior.
     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
     */
    @Test
    public void testCustomRules() throws Exception {
        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
        String DIN5007_2_tailorings =
                "& ae , a\u0308 & AE , A\u0308" +
                "& oe , o\u0308 & OE , O\u0308" +
                "& ue , u\u0308 & UE , U\u0308";

        RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
        String tailoredRules = tailoredCollator.getRules();

        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.rules", tailoredRules)
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);

        TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
        assertCollatesToSame(filterFactory, "Töne", "Toene");
    }

    private void assertCollatesToSame(TokenFilterFactory factory, String string1, String string2) throws IOException {
        assertCollation(factory, string1, string2, 0);
    }

    private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison) throws IOException {
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader(string1));
        TokenStream stream1 = factory.create(tokenizer);

        tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader(string2));
        TokenStream stream2 = factory.create(tokenizer);

        assertCollation(stream1, stream2, comparison);
    }

    private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
        CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
        CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);

        stream1.reset();
        stream2.reset();

        assertThat(stream1.incrementToken(), equalTo(true));
        assertThat(stream2.incrementToken(), equalTo(true));
        assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
        assertThat(stream1.incrementToken(), equalTo(false));
        assertThat(stream2.incrementToken(), equalTo(false));

        stream1.end();
        stream2.end();

        stream1.close();
        stream2.close();
    }
}
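// Background sketch (an assumption about the mechanics, consistent with the
// ICUCollationKeyFilter registration in IcuIndicesAnalysis): two terms
// "collate to same" exactly when the configured ICU collator compares them
// equal, e.g.
//
//   Collator collator = Collator.getInstance(new ULocale("tr"));
//   collator.setStrength(Collator.PRIMARY);
//   assert collator.compare("I WİLL", "ı will") == 0;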
@ -0,0 +1,89 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.CharFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;

import java.io.StringReader;

import static org.elasticsearch.index.analysis.AnalysisTestUtils.createAnalysisService;

/**
 * Tests for the icu_normalizer char filter.
 */
public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase {

    @Test
    public void testDefaultSetting() throws Exception {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);
        CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");

        String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
        Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
        String expectedOutput = normalizer.normalize(input);
        CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
        char[] tempBuff = new char[10];
        StringBuilder output = new StringBuilder();
        while (true) {
            int length = inputReader.read(tempBuff);
            if (length == -1) break;
            output.append(tempBuff, 0, length);
            // correctOffset maps an offset in the filtered output back to the
            // corresponding offset in the original input, so the output so far
            // must equal the normalization of the input prefix consumed so far.
            assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
        }
        assertEquals(expectedOutput, output.toString());
    }

    @Test
    public void testNameAndModeSetting() throws Exception {
        Settings settings = Settings.settingsBuilder()
                .put("path.home", createTempDir())
                .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
                .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
                .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
                .build();
        AnalysisService analysisService = createAnalysisService(settings);
        CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");

        String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
        Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
        String expectedOutput = normalizer.normalize(input);
        CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
        char[] tempBuff = new char[10];
        StringBuilder output = new StringBuilder();
        while (true) {
            int length = inputReader.read(tempBuff);
            if (length == -1) break;
            output.append(tempBuff, 0, length);
            assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
        }
        assertEquals(expectedOutput, output.toString());
    }
}
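// The settings in testNameAndModeSetting map to this index-settings JSON
// (sketch of the equivalent configuration):
//
//   "index.analysis.char_filter.myNormalizerChar": {
//     "type": "icu_normalizer", "name": "nfkc", "mode": "decompose"
//   }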
@ -0,0 +1,247 @@
package org.elasticsearch.index.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.carrotsearch.randomizedtesting.annotations.Listeners;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope.Scope;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.TimeUnits;
import org.elasticsearch.test.junit.listeners.ReproduceInfoPrinter;
import org.junit.BeforeClass;

import java.util.Locale;

/**
 * @deprecated Remove when IndexableBinaryStringTools is removed.
 */
@Deprecated
@Listeners({
        ReproduceInfoPrinter.class
})
@ThreadLeakScope(Scope.NONE)
@TimeoutSuite(millis = TimeUnits.HOUR)
@LuceneTestCase.SuppressSysoutChecks(bugUrl = "we log a lot on purpose")
public class TestIndexableBinaryStringTools extends LuceneTestCase {
    private static int NUM_RANDOM_TESTS;
    private static int MAX_RANDOM_BINARY_LENGTH;

    @BeforeClass
    public static void beforeClass() throws Exception {
        NUM_RANDOM_TESTS = atLeast(200);
        MAX_RANDOM_BINARY_LENGTH = atLeast(300);
    }

    public void testSingleBinaryRoundTrip() {
        byte[] binary = new byte[] { (byte) 0x23, (byte) 0x98, (byte) 0x13,
                (byte) 0xE4, (byte) 0x76, (byte) 0x41, (byte) 0xB2, (byte) 0xC9,
                (byte) 0x7F, (byte) 0x0A, (byte) 0xA6, (byte) 0xD8 };

        int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, binary.length);
        char[] encoded = new char[encodedLen];
        IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0, encoded.length);

        int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, encoded.length);
        byte[] decoded = new byte[decodedLen];
        IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, decoded.length);

        assertEquals("Round trip encode/decode returned different results:"
                + System.getProperty("line.separator") + "original: "
                + binaryDump(binary, binary.length)
                + System.getProperty("line.separator") + " encoded: "
                + charArrayDump(encoded, encoded.length)
                + System.getProperty("line.separator") + " decoded: "
                + binaryDump(decoded, decoded.length),
                binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
    }

    public void testEncodedSortability() {
        byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH];
        char[] originalString1 = new char[MAX_RANDOM_BINARY_LENGTH];
        char[] encoded1 = new char[MAX_RANDOM_BINARY_LENGTH * 10];
        byte[] original2 = new byte[MAX_RANDOM_BINARY_LENGTH];
        char[] originalString2 = new char[MAX_RANDOM_BINARY_LENGTH];
        char[] encoded2 = new char[MAX_RANDOM_BINARY_LENGTH * 10];

        for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
            int numBytes1 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1

            for (int byteNum = 0; byteNum < numBytes1; ++byteNum) {
                int randomInt = random().nextInt(0x100);
                originalArray1[byteNum] = (byte) randomInt;
                originalString1[byteNum] = (char) randomInt;
            }

            int numBytes2 = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1

            for (int byteNum = 0; byteNum < numBytes2; ++byteNum) {
                int randomInt = random().nextInt(0x100);
                original2[byteNum] = (byte) randomInt;
                originalString2[byteNum] = (char) randomInt;
            }
            int originalComparison = new String(originalString1, 0, numBytes1)
                    .compareTo(new String(originalString2, 0, numBytes2));
            originalComparison = originalComparison < 0 ? -1 : originalComparison > 0 ? 1 : 0;

            int encodedLen1 = IndexableBinaryStringTools.getEncodedLength(originalArray1, 0, numBytes1);
            if (encodedLen1 > encoded1.length) {
                encoded1 = new char[ArrayUtil.oversize(encodedLen1, RamUsageEstimator.NUM_BYTES_CHAR)];
            }
            IndexableBinaryStringTools.encode(originalArray1, 0, numBytes1, encoded1, 0, encodedLen1);

            int encodedLen2 = IndexableBinaryStringTools.getEncodedLength(original2, 0, numBytes2);
            if (encodedLen2 > encoded2.length) {
                encoded2 = new char[ArrayUtil.oversize(encodedLen2, RamUsageEstimator.NUM_BYTES_CHAR)];
            }
            IndexableBinaryStringTools.encode(original2, 0, numBytes2, encoded2, 0, encodedLen2);

            int encodedComparison = new String(encoded1, 0, encodedLen1)
                    .compareTo(new String(encoded2, 0, encodedLen2));
            encodedComparison = encodedComparison < 0 ? -1 : encodedComparison > 0 ? 1 : 0;

            assertEquals("Test #" + (testNum + 1)
                    + ": Original bytes and encoded chars compare differently:"
                    + System.getProperty("line.separator") + " binary 1: "
                    + binaryDump(originalArray1, numBytes1)
                    + System.getProperty("line.separator") + " binary 2: "
                    + binaryDump(original2, numBytes2)
                    + System.getProperty("line.separator") + "encoded 1: "
                    + charArrayDump(encoded1, encodedLen1)
                    + System.getProperty("line.separator") + "encoded 2: "
                    + charArrayDump(encoded2, encodedLen2)
                    + System.getProperty("line.separator"), originalComparison,
                    encodedComparison);
        }
    }

    public void testEmptyInput() {
        byte[] binary = new byte[0];

        int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, binary.length);
        char[] encoded = new char[encodedLen];
        IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0, encoded.length);

        int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, encoded.length);
        byte[] decoded = new byte[decodedLen];
        IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, decoded.length);

        assertEquals("decoded empty input was not empty", 0, decoded.length);
    }

    public void testAllNullInput() {
        byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 };

        int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, binary.length);
        char[] encoded = new char[encodedLen];
        IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0, encoded.length);

        int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, encoded.length);
        byte[] decoded = new byte[decodedLen];
        IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, decoded.length);

        assertEquals("Round trip encode/decode returned different results:"
                + System.getProperty("line.separator") + "  original: "
                + binaryDump(binary, binary.length)
                + System.getProperty("line.separator") + "decodedBuf: "
                + binaryDump(decoded, decoded.length),
                binaryDump(binary, binary.length), binaryDump(decoded, decoded.length));
    }

    public void testRandomBinaryRoundTrip() {
        byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH];
        char[] encoded = new char[MAX_RANDOM_BINARY_LENGTH * 10];
        byte[] decoded = new byte[MAX_RANDOM_BINARY_LENGTH];
        for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) {
            int numBytes = random().nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1

            for (int byteNum = 0; byteNum < numBytes; ++byteNum) {
                binary[byteNum] = (byte) random().nextInt(0x100);
            }

            int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, numBytes);
            if (encoded.length < encodedLen) {
                encoded = new char[ArrayUtil.oversize(encodedLen, RamUsageEstimator.NUM_BYTES_CHAR)];
            }
            IndexableBinaryStringTools.encode(binary, 0, numBytes, encoded, 0, encodedLen);

            int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, encodedLen);
            IndexableBinaryStringTools.decode(encoded, 0, encodedLen, decoded, 0, decodedLen);

            assertEquals("Test #" + (testNum + 1)
                    + ": Round trip encode/decode returned different results:"
                    + System.getProperty("line.separator") + "  original: "
                    + binaryDump(binary, numBytes) + System.getProperty("line.separator")
                    + "encodedBuf: " + charArrayDump(encoded, encodedLen)
                    + System.getProperty("line.separator") + "decodedBuf: "
                    + binaryDump(decoded, decodedLen), binaryDump(binary, numBytes),
                    binaryDump(decoded, decodedLen));
        }
    }

    public String binaryDump(byte[] binary, int numBytes) {
        StringBuilder buf = new StringBuilder();
        for (int byteNum = 0; byteNum < numBytes; ++byteNum) {
            String hex = Integer.toHexString(binary[byteNum] & 0xFF);
            if (hex.length() == 1) {
                buf.append('0');
            }
            buf.append(hex.toUpperCase(Locale.ROOT));
            if (byteNum < numBytes - 1) {
                buf.append(' ');
            }
        }
        return buf.toString();
    }

    public String charArrayDump(char[] charArray, int numChars) {
        StringBuilder buf = new StringBuilder();
        for (int charNum = 0; charNum < numChars; ++charNum) {
            String hex = Integer.toHexString(charArray[charNum]);
            for (int digit = 0; digit < 4 - hex.length(); ++digit) {
                buf.append('0');
            }
            buf.append(hex.toUpperCase(Locale.ROOT));
            if (charNum < numChars - 1) {
                buf.append(' ');
            }
        }
        return buf.toString();
    }
}