Merge branch 'master' of https://github.com/elasticsearch/elasticsearch
|
@ -37,7 +37,7 @@ First of all, DON'T PANIC. It will take 5 minutes to get the gist of what Elasti
|
|||
h3. Installation
|
||||
|
||||
* "Download":http://www.elasticsearch.org/download and unzip the Elasticsearch official distribution.
|
||||
* Run @bin/elasticsearch@ on unix, or @bin/elasticsearch.bat@ on windows.
|
||||
* Run @bin/elasticsearch@ on unix, or @bin\elasticsearch.bat@ on windows.
|
||||
* Run @curl -X GET http://localhost:9200/@.
|
||||
* Start more servers ...
|
||||
|
||||
|
|
|
@ -20,13 +20,13 @@ mvn clean package -DskipTests
|
|||
|
||||
To disable and enable network transport, set the `Des.node.mode`.
|
||||
|
||||
Use network transport (default):
|
||||
Use network transport:
|
||||
|
||||
------------------------------------
|
||||
-Des.node.mode=network
|
||||
------------------------------------
|
||||
|
||||
Use local transport:
|
||||
Use local transport (default since 1.3):
|
||||
|
||||
-------------------------------------
|
||||
-Des.node.mode=local
|
||||
|
@ -62,6 +62,29 @@ Run any test methods that contain 'esi' (like: ...r*esi*ze...).
|
|||
mvn test "-Dtests.method=*esi*"
|
||||
-------------------------------
|
||||
|
||||
You can also filter tests by certain annotations ie:
|
||||
|
||||
* `@Slow` - tests that are know to take a long time to execute
|
||||
* `@Nightly` - tests that only run in nightly builds (disabled by default)
|
||||
* `@Integration` - integration tests
|
||||
* `@Backwards` - backwards compatibility tests (disabled by default)
|
||||
* `@AwaitsFix` - tests that are waiting for a bugfix (disabled by default)
|
||||
* `@BadApple` - tests that are known to fail randomly (disabled by default)
|
||||
|
||||
Those annotation names can be combined into a filter expression like:
|
||||
|
||||
------------------------------------------------
|
||||
mvn test -Dtests.filter="@nightly and not @slow"
|
||||
------------------------------------------------
|
||||
|
||||
to run all nightly test but not the ones that are slow. `tests.filter` supports
|
||||
the boolean operators `and, or, not` and grouping ie:
|
||||
|
||||
|
||||
---------------------------------------------------------------
|
||||
mvn test -Dtests.filter="@nightly and not(@slow or @backwards)"
|
||||
---------------------------------------------------------------
|
||||
|
||||
=== Seed and repetitions.
|
||||
|
||||
Run with a given seed (seed is a hex-encoded long).
|
||||
|
@ -184,14 +207,23 @@ To run backwards compatibiilty tests untar or unzip a release and run the tests
|
|||
with the following command:
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
mvn test -Dtests.bwc=true -Dtests.bwc.version=x.y.z -Dtests.bwc.path=/path/to/elasticsearch
|
||||
mvn test -Dtests.filter="@backwards" -Dtests.bwc.version=x.y.z -Dtests.bwc.path=/path/to/elasticsearch
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
If the elasticsearch release is placed under `./backwards/elasticsearch-x.y.z` the path
|
||||
can be omitted:
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
mvn test -Dtests.bwc=true -Dtests.bwc.version=x.y.z
|
||||
mvn test -Dtests.filter="@backwards" -Dtests.bwc.version=x.y.z
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
To setup the bwc test environment execute the following steps (provided you are
|
||||
already in your elasticsearch clone):
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
$ mkdir backwards && cd backwards
|
||||
$ curl -O https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-1.2.1.tar.gz
|
||||
$ tar -xzf elasticsearch-1.2.1.tar.gz
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
== Testing the REST layer
|
||||
|
|
|
@ -62,9 +62,14 @@ REM The path to the heap dump location, note directory must exists and have enou
|
|||
REM space for a full heap dump.
|
||||
REM JAVA_OPTS=%JAVA_OPTS% -XX:HeapDumpPath=$ES_HOME/logs/heapdump.hprof
|
||||
|
||||
REM Disables explicit GC
|
||||
set JAVA_OPTS=%JAVA_OPTS% -XX:+DisableExplicitGC
|
||||
|
||||
set ES_CLASSPATH=%ES_CLASSPATH%;%ES_HOME%/lib/${project.build.finalName}.jar;%ES_HOME%/lib/*;%ES_HOME%/lib/sigar/*
|
||||
set ES_PARAMS=-Delasticsearch -Des-foreground=yes -Des.path.home="%ES_HOME%"
|
||||
|
||||
TITLE Elasticsearch ${project.version}
|
||||
|
||||
"%JAVA_HOME%\bin\java" %JAVA_OPTS% %ES_JAVA_OPTS% %ES_PARAMS% %* -cp "%ES_CLASSPATH%" "org.elasticsearch.bootstrap.Elasticsearch"
|
||||
goto finally
|
||||
|
||||
|
|
|
@ -62,3 +62,6 @@ JAVA_OPTS="$JAVA_OPTS -XX:+HeapDumpOnOutOfMemoryError"
|
|||
# The path to the heap dump location, note directory must exists and have enough
|
||||
# space for a full heap dump.
|
||||
#JAVA_OPTS="$JAVA_OPTS -XX:HeapDumpPath=$ES_HOME/logs/heapdump.hprof"
|
||||
|
||||
# Disables explicit GC
|
||||
JAVA_OPTS="$JAVA_OPTS -XX:+DisableExplicitGC"
|
||||
|
|
|
@ -45,5 +45,5 @@ while [ $# -gt 0 ]; do
|
|||
shift
|
||||
done
|
||||
|
||||
exec $JAVA $JAVA_OPTS -Xmx64m -Xms16m -Delasticsearch -Des.path.home="$ES_HOME" $properties -cp "$ES_HOME/lib/*" org.elasticsearch.plugins.PluginManager $args
|
||||
exec "$JAVA" $JAVA_OPTS -Xmx64m -Xms16m -Delasticsearch -Des.path.home="$ES_HOME" $properties -cp "$ES_HOME/lib/*" org.elasticsearch.plugins.PluginManager $args
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ if NOT DEFINED JAVA_HOME goto err
|
|||
set SCRIPT_DIR=%~dp0
|
||||
for %%I in ("%SCRIPT_DIR%..") do set ES_HOME=%%~dpfI
|
||||
|
||||
TITLE Elasticsearch Plugin Manager ${project.version}
|
||||
|
||||
"%JAVA_HOME%\bin\java" %JAVA_OPTS% -Xmx64m -Xms16m -Des.path.home="%ES_HOME%" -cp "%ES_HOME%/lib/*;" "org.elasticsearch.plugins.PluginManager" %*
|
||||
goto finally
|
||||
|
|
|
@ -43,6 +43,8 @@ set SERVICE_ID=%1
|
|||
|
||||
if "%LOG_OPTS%" == "" set LOG_OPTS=--LogPath "%LOG_DIR%" --LogPrefix "%SERVICE_ID%" --StdError auto --StdOutput auto
|
||||
|
||||
TITLE Elasticsearch Service ${project.version}
|
||||
|
||||
if /i %SERVICE_CMD% == install goto doInstall
|
||||
if /i %SERVICE_CMD% == remove goto doRemove
|
||||
if /i %SERVICE_CMD% == start goto doStart
|
||||
|
@ -160,6 +162,9 @@ REM The path to the heap dump location, note directory must exists and have enou
|
|||
REM space for a full heap dump.
|
||||
REM JAVA_OPTS=%JAVA_OPTS% -XX:HeapDumpPath=$ES_HOME/logs/heapdump.hprof
|
||||
|
||||
REM Disables explicit GC
|
||||
set JAVA_OPTS=%JAVA_OPTS% -XX:+DisableExplicitGC
|
||||
|
||||
if "%DATA_DIR%" == "" set DATA_DIR=%ES_HOME%\data
|
||||
|
||||
if "%WORK_DIR%" == "" set WORK_DIR=%ES_HOME%
|
||||
|
|
|
@ -375,3 +375,11 @@
|
|||
#monitor.jvm.gc.old.warn: 10s
|
||||
#monitor.jvm.gc.old.info: 5s
|
||||
#monitor.jvm.gc.old.debug: 2s
|
||||
|
||||
################################## Security ################################
|
||||
|
||||
# Uncomment if you want to enable JSONP as a valid return transport on the
|
||||
# http server. With this enabled, it may pose a security risk, so disabling
|
||||
# it unless you need it is recommended (it is disabled by default).
|
||||
#
|
||||
#http.jsonp.enable: true
|
||||
|
|
|
@ -18,8 +18,6 @@ java.util.Collections#sort(java.util.List,java.util.Comparator)
|
|||
|
||||
java.io.StringReader#<init>(java.lang.String) @ Use FastStringReader instead
|
||||
|
||||
org.apache.lucene.util.RamUsageEstimator#sizeOf(java.lang.Object) @ This can be a perfromance trap
|
||||
|
||||
@defaultMessage Reference management is tricky, leave it to SearcherManager
|
||||
org.apache.lucene.index.IndexReader#decRef()
|
||||
org.apache.lucene.index.IndexReader#incRef()
|
||||
|
@ -53,11 +51,21 @@ java.lang.Object#notifyAll()
|
|||
java.lang.Math#abs(int)
|
||||
java.lang.Math#abs(long)
|
||||
|
||||
@defaultMessage Please do not try to stop the world
|
||||
java.lang.System#gc()
|
||||
|
||||
@defaultMessage Use Long.compare instead we are on Java7
|
||||
com.google.common.primitives.Longs#compare(long,long)
|
||||
|
||||
@defaultMessage we have an optimized XStringField to reduce analysis creation overhead
|
||||
org.apache.lucene.document.Field#<init>(java.lang.String,java.lang.String,org.apache.lucene.document.FieldType)
|
||||
@defaultMessage Use Channels.* methods to write to channels. Do not write directly.
|
||||
java.nio.channels.WritableByteChannel#write(java.nio.ByteBuffer)
|
||||
java.nio.channels.FileChannel#write(java.nio.ByteBuffer, long)
|
||||
java.nio.channels.GatheringByteChannel#write(java.nio.ByteBuffer[], int, int)
|
||||
java.nio.channels.GatheringByteChannel#write(java.nio.ByteBuffer[])
|
||||
java.nio.channels.ReadableByteChannel#read(java.nio.ByteBuffer)
|
||||
java.nio.channels.ScatteringByteChannel#read(java.nio.ByteBuffer[])
|
||||
java.nio.channels.ScatteringByteChannel#read(java.nio.ByteBuffer[], int, int)
|
||||
java.nio.channels.FileChannel#read(java.nio.ByteBuffer, long)
|
||||
|
||||
@defaultMessage Use XNativeFSLockFactory instead of the buggy NativeFSLockFactory see LUCENE-5738 - remove once Lucene 4.9 is released
|
||||
org.apache.lucene.store.NativeFSLockFactory
|
||||
@defaultMessage Use Lucene.parseLenient instead it strips off minor version
|
||||
org.apache.lucene.util.Version#parseLeniently(java.lang.String)
|
||||
|
|
|
@ -10,16 +10,129 @@
|
|||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on
|
||||
# software distributed under the License is distributed on
|
||||
# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
# either express or implied. See the License for the specific
|
||||
# language governing permissions and limitations under the License
|
||||
|
||||
#
|
||||
# generate property file for the jdk randomization test
|
||||
#
|
||||
# NAME
|
||||
# build_randomization.rb -- Generate property file for the JDK randomization test
|
||||
#
|
||||
# SYNOPSIS
|
||||
# build_randomization.rb [-d] [-l|t]
|
||||
#
|
||||
# DESCRIPTION
|
||||
# This script takes the randomization choices described in RANDOM_CHOICE and generates apporpriate JAVA property file 'prop.txt'
|
||||
# This property file also contain the appropriate JDK selection, randomized. JDK randomization is based on what is available on the Jenkins tools
|
||||
# directory. This script is used by Jenkins test system to conduct Elasticsearch server randomization testing.
|
||||
#
|
||||
# In hash RANDOM_CHOISES, the key of randomization hash maps to key of java property. The value of the hash describes the possible value of the randomization
|
||||
#
|
||||
# For example RANDOM_CHOICES = { 'es.node.mode' => {:choices => ['local', 'network'], :method => :get_random_one} } means
|
||||
# es.node.mode will be set to either 'local' or 'network', each with 50% of probability
|
||||
#
|
||||
# OPTIONS SUMMARY
|
||||
# The options are as follows:
|
||||
#
|
||||
# -d, --debug Increase logging verbosity for debugging purpose
|
||||
# -t, --test Run in test mode. The script will execute unit tests.
|
||||
# -l, --local Run in local mode. In this mode, directory structure will be created under current directory to mimick
|
||||
# Jenkins' server directory layout. This mode is mainly used for development.
|
||||
require 'enumerator'
|
||||
require 'getoptlong'
|
||||
require 'log4r'
|
||||
require 'optparse'
|
||||
require 'rubygems'
|
||||
require 'yaml'
|
||||
include Log4r
|
||||
|
||||
RANDOM_CHOICES = {
|
||||
'tests.jvm.argline' => [
|
||||
{:choices => ['-server'], :method => 'get_random_one'},
|
||||
{:choices => ['-XX:+UseConcMarkSweepGC', '-XX:+UseParallelGC', '-XX:+UseSerialGC', '-XX:+UseG1GC'], :method => 'get_random_one'},
|
||||
{:choices => ['-XX:+UseCompressedOops', '-XX:-UseCompressedOops'], :method => 'get_random_one'}
|
||||
],
|
||||
|
||||
'es.node.mode' => {:choices => ['local', 'network'], :method => 'get_random_one'},
|
||||
|
||||
# bug forced to be false for now :test_nightly => { :method => :true_or_false},
|
||||
'tests.nightly' => {:selections => false},
|
||||
'tests.assertion.disabled'=> {:choices => 'org.elasticsearch', :method => 'get_10_percent'},
|
||||
'tests.security.manager' => {:choices => [true, false], :method => 'get_90_percent'},
|
||||
}
|
||||
|
||||
L = Logger.new 'test_randomizer'
|
||||
L.outputters = Outputter.stdout
|
||||
L.level = INFO
|
||||
C = {:local => false, :test => false}
|
||||
|
||||
|
||||
OptionParser.new do |opts|
|
||||
opts.banner = "Usage: build_ranodimzatin.rb [options]"
|
||||
|
||||
opts.on("-d", "--debug", "Debug mode") do |d|
|
||||
L.level = DEBUG
|
||||
end
|
||||
|
||||
opts.on("-l", "--local", "Run in local mode") do |l|
|
||||
C[:local] = true
|
||||
end
|
||||
|
||||
opts.on("-t", "--test", "Run unit tests") do |t|
|
||||
C[:test] = true
|
||||
end
|
||||
end.parse!
|
||||
|
||||
class Randomizer
|
||||
attr_accessor :data_array
|
||||
|
||||
def initialize(data_array)
|
||||
@data_array = data_array
|
||||
end
|
||||
|
||||
def true_or_false
|
||||
[true, false][rand(2)]
|
||||
end
|
||||
|
||||
def get_random_with_distribution(mdata_array, distribution)
|
||||
L.debug "randomized distribution data %s" % YAML.dump(mdata_array)
|
||||
L.debug "randomized distribution distribution %s" % YAML.dump(distribution)
|
||||
carry = 0
|
||||
distribution_map = distribution.enum_for(:each_with_index).map { |x,i| pre_carry = carry ; carry += x; {i => x + pre_carry} }
|
||||
|
||||
random_size = distribution_map.last.values.first
|
||||
selection = rand(random_size)
|
||||
#get the index that randomize choice mapped to
|
||||
choice = distribution_map.select do |x|
|
||||
x.values.first > selection #only keep the index with distribution value that is higher than the random generated number
|
||||
end.first.keys.first #first hash's first key is the index we want
|
||||
|
||||
L.debug("randomized distribution choice %s" % mdata_array[choice])
|
||||
mdata_array[choice]
|
||||
end
|
||||
|
||||
def get_random_one
|
||||
data_array[rand(data_array.size)]
|
||||
end
|
||||
|
||||
def method_missing(meth, *args, &block)
|
||||
# trap randomization based on percentage
|
||||
if meth.to_s =~ /^get_(\d+)_percent/
|
||||
percentage = $1.to_i
|
||||
remain = 100 - percentage
|
||||
#data = args.first
|
||||
normalized_data = if(!data_array.kind_of?(Array))
|
||||
[data_array, nil]
|
||||
else
|
||||
data_array
|
||||
end
|
||||
get_random_with_distribution(normalized_data, [percentage, remain])
|
||||
else
|
||||
super
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
class JDKSelector
|
||||
attr_reader :directory, :jdk_list
|
||||
|
@ -28,91 +141,333 @@ class JDKSelector
|
|||
@directory = directory
|
||||
end
|
||||
|
||||
# get selection of available jdks from jenkins automatic install directory
|
||||
# get selection of available JDKs from Jenkins automatic install directory
|
||||
def get_jdk
|
||||
@jdk_list = Dir.entries(directory).select do |x|
|
||||
x.chars.first == 'J'
|
||||
@jdk_list = Dir.entries(directory).select do |x|
|
||||
x.chars.first == 'J'
|
||||
end.map do |y|
|
||||
File.join(directory, y)
|
||||
end
|
||||
self
|
||||
end
|
||||
|
||||
# do ranomize selection from a given array
|
||||
def filter_java_6(files)
|
||||
files.select{ |i| File.basename(i).split(/[^0-9]/)[-1].to_i > 6 }
|
||||
end
|
||||
|
||||
# do randomized selection from a given array
|
||||
def select_one(selection_array = nil)
|
||||
selection_array = filter_java_6(selection_array || @jdk_list)
|
||||
selection_array[rand(selection_array.size)]
|
||||
|
||||
get_random_one(selection_array)
|
||||
Randomizer.new(selection_array).get_random_one
|
||||
end
|
||||
end
|
||||
|
||||
def get_random_one(data_array)
|
||||
data_array[rand(data_array.size)]
|
||||
end
|
||||
|
||||
def filter_java_6(files)
|
||||
files.select{ |i| File.basename(i).split(/[^0-9]/)[-1].to_i > 6 }
|
||||
end
|
||||
|
||||
# given a jdk directory selection, generate relevant environment variables
|
||||
def get_env_matrix(data_array)
|
||||
|
||||
#refactoring target
|
||||
es_test_jvm_option1 = get_random_one(['-server']) #only server for now get_random_one(['-client', '-server'])
|
||||
es_test_jvm_option2 = get_random_one(['-XX:+UseConcMarkSweepGC', '-XX:+UseParallelGC', '-XX:+UseSerialGC', '-XX:+UseG1GC'])
|
||||
|
||||
es_test_jvm_option3 = get_random_one(['-XX:+UseCompressedOops', '-XX:-UseCompressedOops'])
|
||||
es_node_mode = get_random_one(['local', 'network'])
|
||||
tests_nightly = get_random_one([true, false])
|
||||
tests_nightly = get_random_one([false]) #bug
|
||||
|
||||
test_assert_off = (rand(10) == 9) #10 percent chance turning it off
|
||||
tests_security_manager = (rand(10) != 9) #10 percent chance running without security manager
|
||||
arg_line = [es_test_jvm_option1, es_test_jvm_option2, es_test_jvm_option3]
|
||||
[*data_array].map do |x|
|
||||
data_hash = {
|
||||
'PATH' => File.join(x,'bin') + ':' + ENV['PATH'],
|
||||
'JAVA_HOME' => x,
|
||||
'BUILD_DESC' => "%s,%s,%s%s,%s %s%s%s"%[File.basename(x), es_node_mode, tests_nightly ? 'nightly,':'',
|
||||
es_test_jvm_option1[1..-1], es_test_jvm_option2[4..-1], es_test_jvm_option3[4..-1],
|
||||
test_assert_off ? ',assert off' : '', tests_security_manager ? ', security manager enabled' : ''],
|
||||
'es.node.mode' => es_node_mode,
|
||||
'tests.nightly' => tests_nightly,
|
||||
'tests.security.manager' => tests_security_manager,
|
||||
'tests.jvm.argline' => arg_line.join(" "),
|
||||
def JDKSelector.generate_jdk_hash(jdk_choice)
|
||||
file_separator = if Gem.win_platform?
|
||||
File::ALT_SEPARATOR
|
||||
else
|
||||
File::SEPARATOR
|
||||
end
|
||||
{
|
||||
:PATH => [jdk_choice, 'bin'].join(file_separator) + File::PATH_SEPARATOR + ENV['PATH'],
|
||||
:JAVA_HOME => jdk_choice
|
||||
}
|
||||
data_hash['tests.assertion.disabled'] = 'org.elasticsearch' if test_assert_off
|
||||
data_hash
|
||||
end
|
||||
end
|
||||
|
||||
# pick first element out of array of hashes, generate write java property file
|
||||
def generate_property_file(directory, data)
|
||||
#array transformation
|
||||
content = data.first.map do |key, value|
|
||||
"%s=%s"%[key, value]
|
||||
#
|
||||
# Fix argument JDK selector
|
||||
#
|
||||
class FixedJDKSelector < JDKSelector
|
||||
def initialize(directory)
|
||||
@directory = [*directory] #selection of directories to pick from
|
||||
end
|
||||
file_name = (ENV['BUILD_ID'] + ENV['BUILD_NUMBER']) || 'prop' rescue 'prop'
|
||||
file_name = file_name.split(File::SEPARATOR).first + '.txt'
|
||||
File.open(File.join(directory, file_name), 'w') do |file|
|
||||
file.write(content.join("\n"))
|
||||
|
||||
def get_jdk
|
||||
#since JDK selection is already specified..jdk list is the @directory
|
||||
@jdk_list = @directory
|
||||
self
|
||||
end
|
||||
|
||||
def select_one(selection_array = nil)
|
||||
#bypass filtering since this is not automatic
|
||||
selection_array ||= @jdk_list
|
||||
Randomizer.new(selection_array).get_random_one
|
||||
end
|
||||
end
|
||||
|
||||
working_directory = ENV['WORKSPACE'] || '/var/tmp'
|
||||
unless(ENV['BUILD_ID'])
|
||||
#local mode set up fake environment
|
||||
test_directory = 'tools/hudson.model.JDK/'
|
||||
unless(File.exist?(test_directory))
|
||||
puts "running local mode, setting up running environment"
|
||||
puts "properties are written to file prop.txt"
|
||||
system("mkdir -p %sJDK{6,7}"%test_directory)
|
||||
end
|
||||
working_directory = ENV['PWD']
|
||||
end
|
||||
# jenkins sets pwd prior to execution
|
||||
jdk_selector = JDKSelector.new(File.join(ENV['PWD'],'tools','hudson.model.JDK'))
|
||||
environment_matrix = get_env_matrix(jdk_selector.get_jdk.select_one)
|
||||
#
|
||||
# Property file writer
|
||||
#
|
||||
class PropertyWriter
|
||||
attr_reader :working_directory
|
||||
|
||||
generate_property_file(working_directory, environment_matrix)
|
||||
def initialize(mworking_directory)
|
||||
@working_directory = mworking_directory
|
||||
end
|
||||
|
||||
# # pick first element out of array of hashes, generate write java property file
|
||||
def generate_property_file(data)
|
||||
directory = working_directory
|
||||
|
||||
#array transformation
|
||||
content = data.to_a.map do |x|
|
||||
x.join('=')
|
||||
end.sort
|
||||
file_name = (ENV['BUILD_ID'] + ENV['BUILD_NUMBER']) || 'prop' rescue 'prop'
|
||||
file_name = file_name.split(File::SEPARATOR).first + '.txt'
|
||||
L.debug "Property file name is %s" % file_name
|
||||
File.open(File.join(directory, file_name), 'w') do |file|
|
||||
file.write(content.join("\n"))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
#
|
||||
# Execute randomization logics
|
||||
#
|
||||
class RandomizedRunner
|
||||
attr_reader :random_choices, :jdk, :p_writer
|
||||
|
||||
def initialize(mrandom_choices, mjdk, mwriter)
|
||||
@random_choices = mrandom_choices
|
||||
@jdk = mjdk
|
||||
@p_writer = mwriter
|
||||
end
|
||||
|
||||
def generate_selections
|
||||
configuration = random_choices
|
||||
|
||||
L.debug "Enter %s" % __method__
|
||||
L.debug "Configuration %s" % YAML.dump(configuration)
|
||||
|
||||
generated = {}
|
||||
configuration.each do |k, v|
|
||||
if(v.kind_of?(Hash))
|
||||
if(v.has_key?(:method))
|
||||
randomizer = Randomizer.new(v[:choices])
|
||||
v[:selections] = randomizer.__send__(v[:method])
|
||||
end
|
||||
else
|
||||
v.each do |x|
|
||||
if(x.has_key?(:method))
|
||||
randomizer = Randomizer.new(x[:choices])
|
||||
x[:selections] = randomizer.__send__(x[:method])
|
||||
end
|
||||
end
|
||||
end
|
||||
end.each do |k, v|
|
||||
if(v.kind_of?(Array))
|
||||
selections = v.inject([]) do |sum, current_hash|
|
||||
sum.push(current_hash[:selections])
|
||||
end
|
||||
else
|
||||
selections = [v[:selections]] unless v[:selections].nil?
|
||||
end
|
||||
generated[k] = selections unless (selections.nil? || selections.size == 0)
|
||||
end
|
||||
|
||||
L.debug "Generated selections %s" % YAML.dump(generated)
|
||||
generated
|
||||
end
|
||||
|
||||
def get_env_matrix(jdk_selection, selections)
|
||||
L.debug "Enter %s" % __method__
|
||||
|
||||
#normalization
|
||||
s = {}
|
||||
selections.each do |k, v|
|
||||
if(v.size > 1)
|
||||
s[k] = v.join(' ') #this should be dependent on class of v[0] and perform reduce operation instead... good enough for now
|
||||
else
|
||||
s[k] = v.first
|
||||
end
|
||||
end
|
||||
j = JDKSelector.generate_jdk_hash(jdk_selection)
|
||||
|
||||
# create build description line
|
||||
desc = {}
|
||||
|
||||
# TODO: better error handling
|
||||
desc[:BUILD_DESC] = "%s,%s,%s%s%s%s" % [
|
||||
File.basename(j[:JAVA_HOME]),
|
||||
s['es.node.mode'],
|
||||
s['tests.nightly'] ? 'nightly,':'',
|
||||
s['tests.jvm.argline'].gsub(/-XX:/,''),
|
||||
s.has_key?('tests.assertion.disabled')? ',assert off' : '',
|
||||
s['tests.security.manager'] ? ',sec manager on' : ''
|
||||
]
|
||||
result = j.merge(s).merge(desc)
|
||||
L.debug(YAML.dump(result))
|
||||
result
|
||||
end
|
||||
|
||||
def run!
|
||||
p_writer.generate_property_file(get_env_matrix(jdk, generate_selections))
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
|
||||
#
|
||||
# Main
|
||||
#
|
||||
unless(C[:test])
|
||||
|
||||
# Check to see if this is running locally
|
||||
unless(C[:local])
|
||||
L.debug("Normal Mode")
|
||||
working_directory = ENV.fetch('WORKSPACE', (Gem.win_platform? ? Dir.pwd : '/var/tmp'))
|
||||
else
|
||||
L.debug("Local Mode")
|
||||
test_directory = 'tools/hudson.model.JDK/'
|
||||
unless(File.exist?(test_directory))
|
||||
L.info "running local mode, setting up running environment"
|
||||
L.info "properties are written to file prop.txt"
|
||||
FileUtils.mkpath "%sJDK6" % test_directory
|
||||
FileUtils.mkpath "%sJDK7" % test_directory
|
||||
end
|
||||
working_directory = Dir.pwd
|
||||
end
|
||||
|
||||
|
||||
# script support both window and linux
|
||||
# TODO: refactor into platform/machine dependent class structure
|
||||
jdk = if(Gem.win_platform?)
|
||||
#window mode jdk directories are fixed
|
||||
#TODO: better logic
|
||||
L.debug("Window Mode")
|
||||
if(File.directory?('y:\jdk7\7u55')) #old window system under ec2
|
||||
FixedJDKSelector.new('y:\jdk7\7u55')
|
||||
else #new metal window system
|
||||
FixedJDKSelector.new(['c:\PROGRA~1\JAVA\jdk1.8.0_05', 'c:\PROGRA~1\JAVA\jdk1.7.0_55'])
|
||||
end
|
||||
else
|
||||
#Jenkins sets pwd prior to execution
|
||||
L.debug("Linux Mode")
|
||||
JDKSelector.new(File.join(ENV['PWD'],'tools','hudson.model.JDK'))
|
||||
end
|
||||
|
||||
runner = RandomizedRunner.new(RANDOM_CHOICES,
|
||||
jdk.get_jdk.select_one,
|
||||
PropertyWriter.new(working_directory))
|
||||
environment_matrix = runner.run!
|
||||
exit 0
|
||||
else
|
||||
require "test/unit"
|
||||
end
|
||||
|
||||
#
|
||||
# Test
|
||||
#
|
||||
class TestJDKSelector < Test::Unit::TestCase
|
||||
L = Logger.new 'test'
|
||||
L.outputters = Outputter.stdout
|
||||
L.level = DEBUG
|
||||
|
||||
def test_hash_generator
|
||||
jdk_choice = '/dummy/jdk7'
|
||||
generated = JDKSelector.generate_jdk_hash(jdk_choice)
|
||||
L.debug "Generated %s" % generated
|
||||
assert generated[:PATH].include?(jdk_choice), "PATH doesn't included choice"
|
||||
assert generated[:JAVA_HOME].include?(jdk_choice), "JAVA home doesn't include choice"
|
||||
end
|
||||
end
|
||||
|
||||
class TestFixJDKSelector < Test::Unit::TestCase
|
||||
L = Logger.new 'test'
|
||||
L.outputters = Outputter.stdout
|
||||
L.level = DEBUG
|
||||
|
||||
def test_initialize
|
||||
['/home/dummy', ['/JDK7', '/home2'], ['home/dummy']].each do |x|
|
||||
test_object = FixedJDKSelector.new(x)
|
||||
assert_kind_of Array, test_object.directory
|
||||
assert_equal [*x], test_object.directory
|
||||
end
|
||||
end
|
||||
|
||||
def test_select_one
|
||||
test_array = %w(one two three)
|
||||
test_object = FixedJDKSelector.new(test_array)
|
||||
assert test_array.include?(test_object.get_jdk.select_one)
|
||||
end
|
||||
|
||||
def test_hash_generator
|
||||
jdk_choice = '/dummy/jdk7'
|
||||
generated = FixedJDKSelector.generate_jdk_hash(jdk_choice)
|
||||
L.debug "Generated %s" % generated
|
||||
assert generated[:PATH].include?(jdk_choice), "PATH doesn't included choice"
|
||||
assert generated[:JAVA_HOME].include?(jdk_choice), "JAVA home doesn't include choice"
|
||||
end
|
||||
end
|
||||
|
||||
class TestPropertyWriter < Test::Unit::TestCase
|
||||
L = Logger.new 'test'
|
||||
L.outputters = Outputter.stdout
|
||||
L.level = DEBUG
|
||||
|
||||
def test_initialize
|
||||
['/home/dummy','/tmp'].each do |x|
|
||||
test_object = PropertyWriter.new(x)
|
||||
assert_kind_of String, test_object.working_directory
|
||||
assert_equal x, test_object.working_directory
|
||||
end
|
||||
end
|
||||
|
||||
def test_generate_property
|
||||
test_file = '/tmp/prop.txt'
|
||||
File.delete(test_file) if File.exist?(test_file)
|
||||
test_object = PropertyWriter.new(File.dirname(test_file))
|
||||
# default prop.txt
|
||||
test_object.generate_property_file({:hi => 'there'})
|
||||
assert(File.exist?(test_file))
|
||||
|
||||
File.open(test_file, 'r') do |properties_file|
|
||||
properties_file.read.each_line do |line|
|
||||
line.strip!
|
||||
assert_equal 'hi=there', line, "content %s is not hi=there" % line
|
||||
end
|
||||
end
|
||||
File.delete(test_file) if File.exist?(test_file)
|
||||
end
|
||||
end
|
||||
|
||||
class DummyPropertyWriter < PropertyWriter
|
||||
def generate_property_file(data)
|
||||
L.debug "generating property file for %s" % YAML.dump(data)
|
||||
L.debug "on directory %s" % working_directory
|
||||
end
|
||||
end
|
||||
|
||||
class TestRandomizedRunner < Test::Unit::TestCase
|
||||
|
||||
def test_initialize
|
||||
test_object = RandomizedRunner.new(RANDOM_CHOICES, '/tmp/dummy/jdk', po = PropertyWriter.new('/tmp'))
|
||||
assert_equal RANDOM_CHOICES, test_object.random_choices
|
||||
assert_equal '/tmp/dummy/jdk', test_object.jdk
|
||||
assert_equal po, test_object.p_writer
|
||||
end
|
||||
|
||||
def test_generate_selection_no_method
|
||||
test_object = RandomizedRunner.new({'tests.one' => {:selections => false }}, '/tmp/dummy/jdk', po = DummyPropertyWriter.new('/tmp'))
|
||||
selection = test_object.generate_selections
|
||||
assert_equal false, selection['tests.one'].first, 'randomization without selection method fails'
|
||||
end
|
||||
|
||||
def test_generate_with_method
|
||||
test_object = RandomizedRunner.new({'es.node.mode' => {:choices => ['local', 'network'], :method => 'get_random_one'}},
|
||||
'/tmp/dummy/jdk', po = DummyPropertyWriter.new('/tmp'))
|
||||
selection = test_object.generate_selections
|
||||
assert ['local', 'network'].include?(selection['es.node.mode'].first), 'selection choice is not correct'
|
||||
end
|
||||
|
||||
def test_get_env_matrix
|
||||
test_object = RandomizedRunner.new(RANDOM_CHOICES,
|
||||
'/tmp/dummy/jdk', po = DummyPropertyWriter.new('/tmp'))
|
||||
selection = test_object.generate_selections
|
||||
env_matrix = test_object.get_env_matrix('/tmp/dummy/jdk', selection)
|
||||
puts YAML.dump(env_matrix)
|
||||
assert_equal '/tmp/dummy/jdk', env_matrix[:JAVA_HOME]
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -234,7 +234,7 @@ def run_mvn(*cmd):
|
|||
for c in cmd:
|
||||
run('%s; %s %s' % (java_exe(), MVN, c))
|
||||
|
||||
def build_release(run_tests=False, dry_run=True, cpus=1):
|
||||
def build_release(run_tests=False, dry_run=True, cpus=1, bwc_version=None):
|
||||
target = 'deploy'
|
||||
if dry_run:
|
||||
target = 'package'
|
||||
|
@ -242,6 +242,9 @@ def build_release(run_tests=False, dry_run=True, cpus=1):
|
|||
run_mvn('clean',
|
||||
'test -Dtests.jvms=%s -Des.node.mode=local' % (cpus),
|
||||
'test -Dtests.jvms=%s -Des.node.mode=network' % (cpus))
|
||||
if bwc_version:
|
||||
print('Running Backwards compatibilty tests against version [%s]' % (bwc_version))
|
||||
run_mvn('clean', 'test -Dtests.filter=@backwards -Dtests.bwc.version=%s -Dtests.bwc=true -Dtests.jvms=1' % bwc_version)
|
||||
run_mvn('clean test-compile -Dforbidden.test.signatures="org.apache.lucene.util.LuceneTestCase\$AwaitsFix @ Please fix all bugs before release"')
|
||||
run_mvn('clean %s -DskipTests' % (target))
|
||||
success = False
|
||||
|
@ -345,7 +348,7 @@ def generate_checksums(files):
|
|||
directory = os.path.dirname(release_file)
|
||||
file = os.path.basename(release_file)
|
||||
checksum_file = '%s.sha1.txt' % file
|
||||
|
||||
|
||||
if os.system('cd %s; shasum %s > %s' % (directory, file, checksum_file)):
|
||||
raise RuntimeError('Failed to generate checksum for file %s' % release_file)
|
||||
res = res + [os.path.join(directory, checksum_file), release_file]
|
||||
|
@ -379,12 +382,12 @@ def smoke_test_release(release, files, expected_hash, plugins):
|
|||
raise RuntimeError('Smoketest failed missing file %s' % (release_file))
|
||||
tmp_dir = tempfile.mkdtemp()
|
||||
if release_file.endswith('tar.gz'):
|
||||
run('tar -xzf %s -C %s' % (release_file, tmp_dir))
|
||||
run('tar -xzf %s -C %s' % (release_file, tmp_dir))
|
||||
elif release_file.endswith('zip'):
|
||||
run('unzip %s -d %s' % (release_file, tmp_dir))
|
||||
run('unzip %s -d %s' % (release_file, tmp_dir))
|
||||
else:
|
||||
log('Skip SmokeTest for [%s]' % release_file)
|
||||
continue # nothing to do here
|
||||
continue # nothing to do here
|
||||
es_run_path = os.path.join(tmp_dir, 'elasticsearch-%s' % (release), 'bin/elasticsearch')
|
||||
print(' Smoke testing package [%s]' % release_file)
|
||||
es_plugin_path = os.path.join(tmp_dir, 'elasticsearch-%s' % (release),'bin/plugin')
|
||||
|
@ -472,7 +475,7 @@ def print_sonartype_notice():
|
|||
for line in settings_file:
|
||||
if line.strip() == '<id>sonatype-nexus-snapshots</id>':
|
||||
# moving out - we found the indicator no need to print the warning
|
||||
return
|
||||
return
|
||||
print("""
|
||||
NOTE: No sonartype settings detected, make sure you have configured
|
||||
your sonartype credentials in '~/.m2/settings.xml':
|
||||
|
@ -499,12 +502,29 @@ def check_s3_credentials():
|
|||
if not env.get('AWS_ACCESS_KEY_ID', None) or not env.get('AWS_SECRET_ACCESS_KEY', None):
|
||||
raise RuntimeError('Could not find "AWS_ACCESS_KEY_ID" / "AWS_SECRET_ACCESS_KEY" in the env variables please export in order to upload to S3')
|
||||
|
||||
VERSION_FILE = 'src/main/java/org/elasticsearch/Version.java'
|
||||
VERSION_FILE = 'src/main/java/org/elasticsearch/Version.java'
|
||||
POM_FILE = 'pom.xml'
|
||||
|
||||
# we print a notice if we can not find the relevant infos in the ~/.m2/settings.xml
|
||||
# we print a notice if we can not find the relevant infos in the ~/.m2/settings.xml
|
||||
print_sonartype_notice()
|
||||
|
||||
# finds the highest available bwc version to test against
|
||||
def find_bwc_version(release_version, bwc_dir='backwards'):
    """Locate the highest released version strictly below `release_version`.

    Scans `bwc_dir` for unpacked 'elasticsearch-X.Y.Z' directories and returns
    the version string of the newest one that is older than `release_version`,
    or None when the directory is missing or holds no candidate.
    """
    log(' Lookup bwc version in directory [%s]' % bwc_dir)
    bwc_version = None
    if os.path.exists(bwc_dir) and os.path.isdir(bwc_dir):
        prefix = 'elasticsearch-'
        max_version = [int(x) for x in release_version.split('.')]
        for entry in os.listdir(bwc_dir):
            if not entry.startswith(prefix):
                continue
            if not os.path.isdir(os.path.join(bwc_dir, entry)):
                continue
            candidate = [int(x) for x in entry[len(prefix):].split('.')]
            # bwc tests only against smaller versions
            if candidate >= max_version:
                continue
            # keep the largest candidate seen so far
            if bwc_version and candidate <= [int(x) for x in bwc_version.split('.')]:
                continue
            bwc_version = entry[len(prefix):]
        log(' Using bwc version [%s]' % bwc_version)
    else:
        log(' bwc directory [%s] does not exists or is not a directory - skipping' % bwc_dir)
    return bwc_version
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Builds and publishes a Elasticsearch Release')
|
||||
parser.add_argument('--branch', '-b', metavar='master', default=get_current_branch(),
|
||||
|
@ -520,11 +540,13 @@ if __name__ == '__main__':
|
|||
help='Publishes the release. Disable by default.')
|
||||
parser.add_argument('--smoke', '-s', dest='smoke', default='',
|
||||
help='Smoke tests the given release')
|
||||
parser.add_argument('--bwc', '-w', dest='bwc', metavar='backwards', default='backwards',
|
||||
help='Backwards compatibility version path to use to run compatibility tests against')
|
||||
|
||||
parser.set_defaults(dryrun=True)
|
||||
parser.set_defaults(smoke=None)
|
||||
args = parser.parse_args()
|
||||
|
||||
bwc_path = args.bwc
|
||||
src_branch = args.branch
|
||||
remote = args.remote
|
||||
run_tests = args.tests
|
||||
|
@ -534,7 +556,7 @@ if __name__ == '__main__':
|
|||
smoke_test_version = args.smoke
|
||||
if not dry_run:
|
||||
check_s3_credentials()
|
||||
print('WARNING: dryrun is set to "false" - this will push and publish the release')
|
||||
print('WARNING: dryrun is set to "false" - this will push and publish the release')
|
||||
input('Press Enter to continue...')
|
||||
|
||||
print(''.join(['-' for _ in range(80)]))
|
||||
|
@ -574,7 +596,7 @@ if __name__ == '__main__':
|
|||
print(' Running maven builds now and publish to sonartype - run-tests [%s]' % run_tests)
|
||||
else:
|
||||
print(' Running maven builds now run-tests [%s]' % run_tests)
|
||||
build_release(run_tests=run_tests, dry_run=dry_run, cpus=cpus)
|
||||
build_release(run_tests=run_tests, dry_run=dry_run, cpus=cpus, bwc_version=find_bwc_version(release_version, bwc_path))
|
||||
artifacts = get_artifacts(release_version)
|
||||
artifacts_and_checksum = generate_checksums(artifacts)
|
||||
smoke_test_release(release_version, artifacts, get_head_hash(), PLUGINS)
|
||||
|
|
|
@ -20,6 +20,7 @@ use warnings;
|
|||
|
||||
use HTTP::Tiny;
|
||||
use IO::Socket::SSL 1.52;
|
||||
use utf8;
|
||||
|
||||
my $Base_URL = 'https://api.github.com/repos/';
|
||||
my $User_Repo = 'elasticsearch/elasticsearch/';
|
||||
|
@ -85,6 +86,9 @@ sub dump_issues {
|
|||
}
|
||||
for my $issue (@$header_issues) {
|
||||
my $title = $issue->{title};
|
||||
$title=~s{`([^`]+)`}{<code>$1</code>}g
|
||||
if $format eq 'html';
|
||||
|
||||
if ( $issue->{state} eq 'open' ) {
|
||||
$title .= " [OPEN]";
|
||||
}
|
||||
|
|
|
@ -27,10 +27,11 @@ grant {
|
|||
permission java.io.FilePermission "${junit4.childvm.cwd}", "read,execute,write";
|
||||
permission java.io.FilePermission "${junit4.childvm.cwd}${/}-", "read,execute,write,delete";
|
||||
permission java.io.FilePermission "${junit4.tempDir}${/}*", "read,execute,write,delete";
|
||||
|
||||
permission groovy.security.GroovyCodeSourcePermission "/groovy/script";
|
||||
|
||||
// Allow connecting to the internet anywhere
|
||||
permission java.net.SocketPermission "*", "accept,listen,connect,resolve";
|
||||
|
||||
|
||||
// Basic permissions needed for Lucene / Elasticsearch to work:
|
||||
permission java.util.PropertyPermission "*", "read,write";
|
||||
permission java.lang.reflect.ReflectPermission "*";
|
||||
|
|
|
@ -0,0 +1,321 @@
|
|||
# Licensed to Elasticsearch under one or more contributor
|
||||
# license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright
|
||||
# ownership. Elasticsearch licenses this file to you under
|
||||
# the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on
|
||||
# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
# either express or implied. See the License for the specific
|
||||
# language governing permissions and limitations under the License.
|
||||
|
||||
import random
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
import re
|
||||
|
||||
from datetime import datetime
|
||||
try:
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch.exceptions import ConnectionError
|
||||
from elasticsearch.exceptions import TransportError
|
||||
except ImportError as e:
|
||||
print('Can\'t import elasticsearch please install `sudo pip install elasticsearch`')
|
||||
raise e
|
||||
|
||||
|
||||
'''This file executes a basic upgrade test by running a full cluster restart.
|
||||
|
||||
The upgrade test starts 2 or more nodes of an old elasticsearch version, indexes
|
||||
a random number of documents into the running nodes and executes a full cluster restart.
|
||||
After the nodes are recovered a small set of basic checks are executed to ensure all
|
||||
documents are still searchable and field data can be loaded etc.
|
||||
|
||||
NOTE: This script requires the elasticsearch python client `elasticsearch-py` run the following command to install:
|
||||
|
||||
`sudo pip install elasticsearch`
|
||||
|
||||
if you are running python3 you need to install the client using pip3. On OSX `pip3` will be included in the Python 3.4
|
||||
release available on `https://www.python.org/download/`:
|
||||
|
||||
`sudo pip3 install elasticsearch`
|
||||
|
||||
See `https://github.com/elasticsearch/elasticsearch-py` for details
|
||||
|
||||
In order to run this test two different version of elasticsearch are required. Both need to be unpacked into
|
||||
the same directory:
|
||||
|
||||
```
|
||||
$ cd /path/to/elasticsearch/clone
|
||||
$ mkdir backwards && cd backwards
|
||||
$ wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-1.3.1.tar.gz
|
||||
$ wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-0.90.13.tar.gz
|
||||
$ tar -zxvf elasticsearch-1.3.1.tar.gz && tar -zxvf elasticsearch-0.90.13.tar.gz
|
||||
$ cd ..
|
||||
$ python dev-tools/upgrade-tests.py --version.backwards 0.90.13 --version.current 1.3.1
|
||||
```
|
||||
'''
|
||||
|
||||
BLACK_LIST = {'1.2.0' : { 'reason': 'Contains a major bug where routing hashes are not consistent with previous version',
|
||||
'issue': 'https://github.com/elasticsearch/elasticsearch/pull/6393'},
|
||||
'1.3.0' : { 'reason': 'Lucene Related bug prevents upgrades from 0.90.7 and some earlier versions ',
|
||||
'issue' : 'https://github.com/elasticsearch/elasticsearch/pull/7055'}}
|
||||
# sometimes returns True
|
||||
def rarely():
    """Return True about one time in eleven (a uniform draw from [0, 10] hits 0)."""
    return not random.randint(0, 10)
|
||||
|
||||
# usually returns True
|
||||
def frequently():
    """Complement of rarely(): returns True most of the time."""
    if rarely():
        return False
    return True
|
||||
|
||||
# asserts the correctness of the given hits given they are sorted asc
|
||||
def assert_sort(hits):
    """Assert the hits of a search response are sorted in ascending order.

    `hits` is a raw search response dict; each hit's 'sort' value is compared
    against its predecessor. Raises AssertionError on an empty result or on
    any out-of-order pair.
    """
    values = [hit['sort'] for hit in hits['hits']['hits']]
    # fixed typo in the failure message ('emtpy' -> 'empty')
    assert len(values) > 0, 'expected non empty result'
    val = min(values)
    for x in values:
        assert x >= val, '%s >= %s' % (x, val)
        val = x
|
||||
|
||||
# asserts that the cluster health didn't timeout etc.
|
||||
def assert_health(cluster_health, num_shards, num_replicas):
    """Fail if the cluster health response reports a timeout.

    `num_shards` / `num_replicas` are accepted for caller symmetry but are not
    inspected by this check.
    """
    timed_out = cluster_health['timed_out']
    assert timed_out == False, 'cluster health timed out %s' % cluster_health
|
||||
|
||||
|
||||
# Starts a new elasticsearch node from a released & untared version.
|
||||
# This node uses unicast discovery with the provided unicast host list and starts
|
||||
# the nodes with the given data directory. This allows shutting down and starting up
|
||||
# nodes on the same data dir simulating a full cluster restart.
|
||||
def start_node(version, data_dir, node_dir, unicast_host_list, tcp_port, http_port):
    """Start a released & untarred elasticsearch node and return its Popen handle.

    The node uses unicast discovery with the given host list and the shared
    `data_dir`, which lets callers simulate a full cluster restart by stopping
    and restarting nodes on the same data directory.
    """
    es_run_path = os.path.join(node_dir, 'elasticsearch-%s' % (version), 'bin/elasticsearch')
    cmd = [es_run_path,
           '-Des.path.data=%s' % data_dir, '-Des.cluster.name=upgrade_test',
           '-Des.discovery.zen.ping.unicast.hosts=%s' % unicast_host_list,
           '-Des.discovery.zen.ping.multicast.enabled=false',
           '-Des.script.disable_dynamic=true',
           '-Des.transport.tcp.port=%s' % tcp_port,
           '-Des.http.port=%s' % http_port]
    if version.startswith('0.90.'):
        # 0.90.x daemonizes by default; '-f' keeps it in the foreground.
        # (Previously a bogus empty-string argument was passed for newer
        # versions; now the flag is only appended when actually needed.)
        cmd.append('-f')
    return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
|
||||
|
||||
# Indexes the given number of document into the given index
|
||||
# and randomly runs refresh, optimize and flush commands
|
||||
def index_documents(es, index_name, type, num_docs):
    """Index `num_docs` documents of doc_type `type` into `index_name`.

    Each document carries a random 'string' field plus 'long_sort' /
    'double_sort' numeric fields (used later by run_basic_asserts for sorted
    searches). Refresh, flush and optimize are interleaved at random via
    rarely() to exercise different engine code paths; a final refresh makes
    every document searchable before returning.
    """
    logging.info('Indexing %s docs' % num_docs)
    for id in range(0, num_docs):
        es.index(index=index_name, doc_type=type, id=id, body={'string': str(random.randint(0, 100)),
                                                               'long_sort': random.randint(0, 100),
                                                               'double_sort' : float(random.randint(0, 100))})
        if rarely():
            es.indices.refresh(index=index_name)
        if rarely():
            # force=frequently(): usually force the flush even without pending changes
            es.indices.flush(index=index_name, force=frequently())
        if rarely():
            es.indices.optimize(index=index_name)
    # final refresh so all indexed docs are visible to searches
    es.indices.refresh(index=index_name)
|
||||
|
||||
# Runs a basic number of assertions including:
|
||||
# - document counts
|
||||
# - match all search with sort on double / long
|
||||
# - Realtime GET operations
|
||||
# TODO(simonw): we should add stuff like:
|
||||
# - dates including sorting
|
||||
# - string sorting
|
||||
# - docvalues if available
|
||||
# - global ordinal if available
|
||||
# Runs a basic number of assertions including:
#  - document counts
#  - match all search with sort on double / long
#  - Realtime GET operations
# TODO(simonw): we should add stuff like:
#  - dates including sorting
#  - string sorting
#  - docvalues if available
#  - global ordinal if available
def run_basic_asserts(es, index_name, type, num_docs):
    """Sanity-check an index: doc count, realtime GETs and sorted searches.

    Assumes the index was populated by index_documents (relies on the
    'double_sort' / 'long_sort' fields it writes). Raises AssertionError on
    any mismatch.
    """
    count = es.count(index=index_name)['count']
    assert count == num_docs, 'Expected %r but got %r documents' % (num_docs, count)
    # realtime GET of num_docs randomly chosen ids (ids are 0..num_docs-1)
    for _ in range(0, num_docs):
        random_doc_id = random.randint(0, num_docs-1)
        doc = es.get(index=index_name, doc_type=type, id=random_doc_id)
        assert doc, 'Expected document for id %s but got %s' % (random_doc_id, doc)

    # match-all searches sorted ascending on each numeric field;
    # assert_sort verifies the returned order is actually ascending
    assert_sort(es.search(index=index_name,
                body={
                    'sort': [
                        {'double_sort': {'order': 'asc'}}
                    ]
                }))

    assert_sort(es.search(index=index_name,
                body={
                    'sort': [
                        {'long_sort': {'order': 'asc'}}
                    ]
                }))
|
||||
|
||||
|
||||
# picks a random version or an entire random version tuple from the directory
|
||||
# to run the backwards tests against.
|
||||
def pick_random_upgrade_version(directory, lower_version=None, upper_version=None):
    """Choose the (from, to) version pair for the upgrade test.

    Scans `directory` for unpacked 'elasticsearch-X.Y.Z' installs, skipping
    blacklisted versions. If both bounds are given they are returned as-is;
    if only one is given, the other side is picked at random from compatible
    candidates; otherwise two random distinct versions are returned in
    ascending order. Raises AssertionError when too few candidates exist.
    """
    if lower_version and upper_version:
        return lower_version, upper_version
    assert os.path.isdir(directory), 'No such directory %s' % directory
    versions = []
    for name in os.listdir(directory):
        if not re.match(r'^elasticsearch-\d+[.]\d+[.]\d+$', name):
            continue
        version = name[len('elasticsearch-'):]
        if not version in BLACK_LIST:
            versions.append(build_tuple(version))
    versions.sort()

    # NOTE: list comprehensions (not filter()) are required here -- under
    # Python 3 filter() returns an iterator, which breaks the len() checks
    # and random.shuffle() calls below.
    if lower_version: # lower version is set - picking a higher one
        versions = [v for v in versions if v > build_tuple(lower_version)]
        assert len(versions) >= 1, 'Expected at least 1 higher version than %s version in %s ' % (lower_version, directory)
        random.shuffle(versions)
        return lower_version, build_version(versions[0])
    if upper_version:
        versions = [v for v in versions if v < build_tuple(upper_version)]
        assert len(versions) >= 1, 'Expected at least 1 lower version than %s version in %s ' % (upper_version, directory)
        random.shuffle(versions)
        return build_version(versions[0]), upper_version
    assert len(versions) >= 2, 'Expected at least 2 different version in %s but found %s' % (directory, len(versions))
    random.shuffle(versions)
    versions = versions[0:2]
    versions.sort()
    return build_version(versions[0]), build_version(versions[1])
|
||||
|
||||
def build_version(version_tuple):
    """Render a numeric version sequence like [1, 3, 1] as the string '1.3.1'."""
    parts = map(str, version_tuple)
    return '.'.join(parts)
|
||||
|
||||
def build_tuple(version_string):
    """Parse a version string like '1.3.1' into its numeric parts [1, 3, 1]."""
    return list(map(int, version_string.split('.')))
|
||||
|
||||
# returns a new elasticsearch client and ensures the all nodes have joined the cluster
|
||||
# this method waits at most 30 seconds for all nodes to join
|
||||
def new_es_instance(num_nodes, http_port, timeout = 30):
    """Return an Elasticsearch client once all `num_nodes` nodes have joined.

    Connects to consecutive HTTP ports starting at `http_port`, waits for the
    cluster to report `num_nodes` members, and verifies a count request
    succeeds. Retries once per second for up to `timeout` seconds; fails with
    AssertionError if the cluster never forms.
    """
    logging.info('Waiting for %s nodes to join the cluster' % num_nodes)
    for _ in range(0, timeout):
        # TODO(simonw): ask Honza if there is a better way to do this?
        try:
            es = Elasticsearch([
                {'host': '127.0.0.1', 'port': http_port + x}
                for x in range(0, num_nodes)])
            es.cluster.health(wait_for_nodes=num_nodes)
            es.count() # can we actually search or do we get a 503? -- anyway retry
            return es
        except (ConnectionError, TransportError):
            # cluster not up yet -- sleep and retry
            pass
        time.sleep(1)
    assert False, 'Timed out waiting for %s nodes for %s seconds' % (num_nodes, timeout)
|
||||
|
||||
def assert_versions(bwc_version, current_version, node_dir):
    """Validate the upgrade pair before running the test.

    Checks that bwc_version < current_version numerically, that neither
    version is blacklisted, and that the current version's install directory
    exists under `node_dir`. Raises AssertionError on any violation.
    """
    assert [int(x) for x in bwc_version.split('.')] < [int(x) for x in current_version.split('.')],\
        '[%s] must be < than [%s]' % (bwc_version, current_version)
    for version in [bwc_version, current_version]:
        assert not version in BLACK_LIST, 'Version %s is blacklisted - %s, see %s' \
            % (version, BLACK_LIST[version]['reason'],
               BLACK_LIST[version]['issue'])
    # NOTE(review): only current_version's install dir is verified here, and
    # the failure message reuses the loop variable `version` (which equals
    # current_version after the loop) -- confirm the bwc install dir is
    # checked elsewhere before relying on this.
    dir = os.path.join(node_dir, 'elasticsearch-%s' % current_version)
    assert os.path.isdir(dir), 'Expected elasticsearch-%s install directory does not exists: %s' % (version, dir)
|
||||
|
||||
def full_cluster_restart(node_dir, current_version, bwc_version, tcp_port, http_port):
    """Run the full-cluster-restart upgrade test from bwc_version to current_version.

    Starts 2-3 nodes of the old version on a shared temp data dir, indexes a
    random number of docs, runs the basic asserts, terminates all nodes, then
    restarts the same data dir with the new version (plus one extra node to
    trigger relocation) and re-runs the asserts. Always terminates nodes and
    removes the temp data dir in the finally block.

    NOTE(review): the initial log line references a module-global `seed` set
    in the __main__ block -- this function assumes it is defined.
    """
    assert_versions(bwc_version, current_version, node_dir)
    num_nodes = random.randint(2, 3)
    nodes = []
    data_dir = tempfile.mkdtemp()
    logging.info('Running upgrade test from [%s] to [%s] seed: [%s] es.path.data: [%s] es.http.port [%s] es.tcp.port [%s]'
                 % (bwc_version, current_version, seed, data_dir, http_port, tcp_port))
    try:
        logging.info('Starting %s BWC nodes of version %s' % (num_nodes, bwc_version))
        # each node gets consecutive tcp/http ports starting at the given base
        unicast_addresses = ','.join(['127.0.0.1:%s' % (tcp_port+x) for x in range(0, num_nodes)])
        for id in range(0, num_nodes):
            nodes.append(start_node(bwc_version, data_dir, node_dir, unicast_addresses, tcp_port+id, http_port+id))
        es = new_es_instance(num_nodes, http_port)
        # ignore=404: the index may legitimately not exist yet
        es.indices.delete(index='test_index', ignore=404)
        num_shards = random.randint(1, 10)
        num_replicas = random.randint(0, 1)
        logging.info('Create index with [%s] shards and [%s] replicas' % (num_shards, num_replicas))
        es.indices.create(index='test_index', body={
            # TODO(simonw): can we do more here in terms of randomization - seems hard due to all the different version
            'settings': {
                'number_of_shards': num_shards,
                'number_of_replicas': num_replicas
            }
        })
        logging.info('Nodes joined, waiting for green status')
        health = es.cluster.health(wait_for_status='green', wait_for_relocating_shards=0)
        assert_health(health, num_shards, num_replicas)
        num_docs = random.randint(10, 100)
        index_documents(es, 'test_index', 'test_type', num_docs)
        logging.info('Run basic asserts before full cluster restart')
        run_basic_asserts(es, 'test_index', 'test_type', num_docs)
        logging.info('kill bwc nodes -- prepare upgrade')
        for node in nodes:
            node.terminate()

        # now upgrade the nodes and rerun the checks
        tcp_port = tcp_port + len(nodes) # bump up port to make sure we can claim them
        http_port = http_port + len(nodes)
        logging.info('Full Cluster restart starts upgrading to version [elasticsearch-%s] es.http.port [%s] es.tcp.port [%s]'
                     % (current_version, http_port, tcp_port))
        nodes = []
        unicast_addresses = ','.join(['127.0.0.1:%s' % (tcp_port+x) for x in range(0, num_nodes)])
        for id in range(0, num_nodes+1): # one more to trigger relocation
            nodes.append(start_node(current_version, data_dir, node_dir, unicast_addresses, tcp_port+id, http_port+id))
        es = new_es_instance(num_nodes+1, http_port)
        logging.info('Nodes joined, waiting for green status')
        health = es.cluster.health(wait_for_status='green', wait_for_relocating_shards=0)
        assert_health(health, num_shards, num_replicas)
        run_basic_asserts(es, 'test_index', 'test_type', num_docs)
        # by running the indexing again we try to catch possible mapping problems after the upgrade
        index_documents(es, 'test_index', 'test_type', num_docs)
        run_basic_asserts(es, 'test_index', 'test_type', num_docs)
        logging.info("[SUCCESS] - all test passed upgrading from version [%s] to version [%s]" % (bwc_version, current_version))
    finally:
        for node in nodes:
            node.terminate()
        time.sleep(1) # wait a second until removing the data dirs to give the nodes a chance to shutdown
        shutil.rmtree(data_dir) # remove the temp data dir
|
||||
|
||||
if __name__ == '__main__':
    # Configure logging: our messages at INFO, quiet the chatty client libraries.
    logging.basicConfig(format='[%(levelname)s] [%(asctime)s] %(message)s', level=logging.INFO,
                        datefmt='%Y-%m-%d %I:%M:%S %p')
    logging.getLogger('elasticsearch').setLevel(logging.ERROR)
    logging.getLogger('urllib3').setLevel(logging.WARN)
    parser = argparse.ArgumentParser(description='Tests Full Cluster Restarts across major version')
    parser.add_argument('--version.backwards', '-b', dest='backwards_version', metavar='V',
                        help='The elasticsearch version to upgrade from')
    parser.add_argument('--version.current', '-c', dest='current_version', metavar='V',
                        help='The elasticsearch version to upgrade to')
    parser.add_argument('--seed', '-s', dest='seed', metavar='N', type=int,
                        help='The random seed to use')
    parser.add_argument('--backwards.dir', '-d', dest='bwc_directory', default='backwards', metavar='dir',
                        help='The directory to the backwards compatibility sources')

    parser.add_argument('--tcp.port', '-p', dest='tcp_port', default=9300, metavar='port', type=int,
                        help='The port to use as the minimum port for TCP communication')
    parser.add_argument('--http.port', '-t', dest='http_port', default=9200, metavar='port', type=int,
                        help='The port to use as the minimum port for HTTP communication')

    parser.set_defaults(bwc_directory='backwards')
    # default seed is wall-clock time so each run differs unless --seed is given
    parser.set_defaults(seed=int(time.time()))
    args = parser.parse_args()
    node_dir = args.bwc_directory
    current_version = args.current_version
    bwc_version = args.backwards_version
    # the seed drives every random choice below; it is logged on failure so
    # a failing run can be reproduced exactly
    seed = args.seed
    random.seed(seed)
    # fill in whichever side of the version pair was not supplied on the CLI
    bwc_version, current_version = pick_random_upgrade_version(node_dir, bwc_version, current_version)
    tcp_port = args.tcp_port
    http_port = args.http_port
    try:
        full_cluster_restart(node_dir, current_version, bwc_version, tcp_port, http_port)
    except:
        # emit a copy/paste-able reproduction command before re-raising
        logging.warn('REPRODUCE WITH: \n\t`python %s --version.backwards %s --version.current %s --seed %s --tcp.port %s --http.port %s`'
                     % (sys.argv[0], bwc_version, current_version, seed, tcp_port, http_port))
        raise
|
|
@ -70,7 +70,7 @@ See the {client}/php-api/current/index.html[official Elasticsearch PHP client].
|
|||
|
||||
* https://github.com/searchbox-io/Jest[Jest]:
|
||||
Java Rest client.
|
||||
* There is of course the [native ES Java client](http://www.elasticsearch.org/guide/en/elasticsearch/client/java-api/current/index.html)
|
||||
* There is of course the http://www.elasticsearch.org/guide/en/elasticsearch/client/java-api/current/index.html[native ES Java client]
|
||||
|
||||
[[community-javascript]]
|
||||
=== JavaScript
|
||||
|
@ -90,14 +90,13 @@ See the {client}/javascript-api/current/index.html[official Elasticsearch JavaSc
|
|||
|
||||
|
||||
[[community-dotnet]]
|
||||
=== .Net
|
||||
=== .NET
|
||||
|
||||
See the {client}/net-api/current/index.html[official Elasticsearch .NET client].
|
||||
|
||||
* https://github.com/Yegoroff/PlainElastic.Net[PlainElastic.Net]:
|
||||
.NET client.
|
||||
|
||||
* https://github.com/Mpdreamz/NEST[NEST]:
|
||||
.NET client.
|
||||
|
||||
* https://github.com/medcl/ElasticSearch.Net[ElasticSearch.NET]:
|
||||
.NET client.
|
||||
|
||||
|
|
|
@ -1,9 +1,6 @@
|
|||
[[front-ends]]
|
||||
== Front Ends
|
||||
|
||||
* https://chrome.google.com/webstore/detail/sense/doinijnbnggojdlcjifpdckfokbbfpbo[Sense]:
|
||||
Chrome curl-like plugin for running requests against an Elasticsearch node
|
||||
|
||||
* https://github.com/mobz/elasticsearch-head[elasticsearch-head]:
|
||||
A web front end for an Elasticsearch cluster.
|
||||
|
||||
|
@ -15,3 +12,6 @@
|
|||
|
||||
* http://elastichammer.exploringelasticsearch.com/[Hammer]:
|
||||
Web front-end for elasticsearch
|
||||
|
||||
* https://github.com/romansanchez/Calaca[Calaca]:
|
||||
Simple search client for Elasticsearch
|
||||
|
|
|
@ -35,8 +35,8 @@
|
|||
* https://drupal.org/project/elasticsearch_connector[Drupal]:
|
||||
Drupal Elasticsearch integration (1.0.0 and later).
|
||||
|
||||
* http://drupal.org/project/elasticsearch[Drupal]:
|
||||
Drupal Elasticsearch integration (0.90 and earlier).
|
||||
* http://drupal.org/project/search_api_elasticsearch[Drupal]:
|
||||
Drupal Elasticsearch integration via Search API (1.0.0 and earlier).
|
||||
|
||||
* https://github.com/refuge/couch_es[couch_es]:
|
||||
elasticsearch helper for couchdb based products (apache couchdb, bigcouch & refuge)
|
||||
|
@ -88,3 +88,7 @@
|
|||
|
||||
* https://github.com/twitter/storehaus[Twitter Storehaus]:
|
||||
Thin asynchronous scala client for storehaus.
|
||||
|
||||
* https://doc.tiki.org/Elasticsearch[Tiki Wiki CMS Groupware]
|
||||
Tiki has native support for Elasticsearch. This provides faster & better search (facets, etc), along with some Natural Language Processing features (ex.: More like this)
|
||||
|
||||
|
|
|
@ -7,6 +7,9 @@
|
|||
* http://github.com/elasticsearch/cookbook-elasticsearch[Chef]:
|
||||
Chef cookbook for Elasticsearch
|
||||
|
||||
* https://github.com/medcl/salt-elasticsearch[SaltStack]:
|
||||
SaltStack Module for Elasticsearch
|
||||
|
||||
* http://www.github.com/neogenix/daikon[daikon]:
|
||||
Daikon Elasticsearch CLI
|
||||
|
||||
|
|
|
@ -54,12 +54,12 @@ different clusters by simply setting the `cluster.name` setting, or
|
|||
explicitly using the `clusterName` method on the builder.
|
||||
|
||||
You can define `cluster.name` in the `/src/main/resources/elasticsearch.yml`
|
||||
dir in your project. As long as `elasticsearch.yml` is present in the
|
||||
file in your project. As long as `elasticsearch.yml` is present in the
|
||||
classpath, it will be used when you start your node.
|
||||
|
||||
[source,java]
|
||||
[source,yaml]
|
||||
--------------------------------------------------
|
||||
cluster.name=yourclustername
|
||||
cluster.name: yourclustername
|
||||
--------------------------------------------------
|
||||
|
||||
Or in Java:
|
||||
|
|
|
@ -8,7 +8,7 @@ index and make it searchable.
|
|||
[[generate]]
|
||||
=== Generate JSON document
|
||||
|
||||
There are different way of generating a JSON document:
|
||||
There are several different ways of generating a JSON document:
|
||||
|
||||
* Manually (aka do it yourself) using native `byte[]` or as a `String`
|
||||
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
[[indexed-scripts]]
|
||||
== Indexed Scripts API
|
||||
|
||||
The indexed script API allows one to interact with scripts and templates
|
||||
stored in an elasticsearch index. It can be used to create, update, get,
|
||||
and delete indexed scripts and templates.
|
||||
|
||||
[source,java]
|
||||
--------------------------------------------------
|
||||
PutIndexedScriptResponse = client.preparePutIndexedScript()
|
||||
.setScriptLang("groovy")
|
||||
.setId("script1")
|
||||
.setSource("_score * doc['my_numeric_field'].value")
|
||||
.execute()
|
||||
.actionGet();
|
||||
|
||||
GetIndexedScriptResponse = client.prepareGetIndexedScript()
|
||||
.setScriptLang("groovy")
|
||||
.setId("script1")
|
||||
.execute()
|
||||
.actionGet();
|
||||
|
||||
DeleteIndexedScriptResponse = client.prepareDeleteIndexedScript()
|
||||
.setScriptLang("groovy")
|
||||
.setId("script1")
|
||||
.execute()
|
||||
.actionGet();
|
||||
--------------------------------------------------
|
||||
|
||||
To store templates simply use "mustache" for the scriptLang.
|
||||
|
||||
=== Script Language
|
||||
|
||||
The API allows one to set the language of the indexed script being
|
||||
interacted with. If one is not provided the default scripting language
|
||||
will be used.
|
|
@ -23,30 +23,53 @@ following types are supported:
|
|||
<<hindi-analyzer,`hindi`>>,
|
||||
<<hungarian-analyzer,`hungarian`>>,
|
||||
<<indonesian-analyzer,`indonesian`>>,
|
||||
<<irish-analyzer,`irish`>>,
|
||||
<<italian-analyzer,`italian`>>,
|
||||
<<norwegian-analyzer,`norwegian`>>,
|
||||
<<persian-analyzer,`persian`>>,
|
||||
<<portuguese-analyzer,`portuguese`>>,
|
||||
<<romanian-analyzer,`romanian`>>,
|
||||
<<russian-analyzer,`russian`>>,
|
||||
<<sorani-analyzer,`sorani`>>,
|
||||
<<spanish-analyzer,`spanish`>>,
|
||||
<<swedish-analyzer,`swedish`>>,
|
||||
<<turkish-analyzer,`turkish`>>,
|
||||
<<thai-analyzer,`thai`>>.
|
||||
|
||||
==== Configuring language analyzers
|
||||
|
||||
===== Stopwords
|
||||
|
||||
All analyzers support setting custom `stopwords` either internally in
|
||||
the config, or by using an external stopwords file by setting
|
||||
`stopwords_path`. Check <<analysis-stop-analyzer,Stop Analyzer>> for
|
||||
more details.
|
||||
|
||||
===== Excluding words from stemming
|
||||
|
||||
The `stem_exclusion` parameter allows you to specify an array
|
||||
of lowercase words that should not be stemmed. Internally, this
|
||||
functionality is implemented by adding the
|
||||
<<analysis-keyword-marker-tokenfilter,`keyword_marker` token filter>>
|
||||
with the `keywords` set to the value of the `stem_exclusion` parameter.
|
||||
|
||||
The following analyzers support setting custom `stem_exclusion` list:
|
||||
`arabic`, `armenian`, `basque`, `catalan`, `bulgarian`, `catalan`,
|
||||
`czech`, `finnish`, `dutch`, `english`, `finnish`, `french`, `galician`,
|
||||
`german`, `hindi`, `hungarian`, `indonesian`, `italian`, `norwegian`,
|
||||
`portuguese`, `romanian`, `russian`, `spanish`, `swedish`, `turkish`.
|
||||
`german`, `irish`, `hindi`, `hungarian`, `indonesian`, `italian`, `norwegian`,
|
||||
`portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `turkish`.
|
||||
|
||||
==== Reimplementing language analyzers
|
||||
|
||||
The built-in language analyzers can be reimplemented as `custom` analyzers
|
||||
(as described below) in order to customize their behaviour.
|
||||
|
||||
NOTE: If you do not intend to exclude words from being stemmed (the
|
||||
equivalent of the `stem_exclusion` parameter above), then you should remove
|
||||
the `keyword_marker` token filter from the custom analyzer configuration.
|
||||
|
||||
[[arabic-analyzer]]
|
||||
==== `arabic` analyzer
|
||||
===== `arabic` analyzer
|
||||
|
||||
The `arabic` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -87,11 +110,11 @@ The `arabic` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[armenian-analyzer]]
|
||||
==== `armenian` analyzer
|
||||
===== `armenian` analyzer
|
||||
|
||||
The `armenian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -131,11 +154,11 @@ The `armenian` analyzer could be reimplemented as a `custom` analyzer as follows
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[basque-analyzer]]
|
||||
==== `basque` analyzer
|
||||
===== `basque` analyzer
|
||||
|
||||
The `basque` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -175,11 +198,11 @@ The `basque` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[brazilian-analyzer]]
|
||||
==== `brazilian` analyzer
|
||||
===== `brazilian` analyzer
|
||||
|
||||
The `brazilian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -219,11 +242,11 @@ The `brazilian` analyzer could be reimplemented as a `custom` analyzer as follow
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[bulgarian-analyzer]]
|
||||
==== `bulgarian` analyzer
|
||||
===== `bulgarian` analyzer
|
||||
|
||||
The `bulgarian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -263,11 +286,11 @@ The `bulgarian` analyzer could be reimplemented as a `custom` analyzer as follow
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[catalan-analyzer]]
|
||||
==== `catalan` analyzer
|
||||
===== `catalan` analyzer
|
||||
|
||||
The `catalan` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -312,11 +335,11 @@ The `catalan` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[chinese-analyzer]]
|
||||
==== `chinese` analyzer
|
||||
===== `chinese` analyzer
|
||||
|
||||
The `chinese` analyzer cannot be reimplemented as a `custom` analyzer
|
||||
because it depends on the ChineseTokenizer and ChineseFilter classes,
|
||||
|
@ -325,7 +348,7 @@ deprecated in Lucene 4 and the `chinese` analyzer will be replaced
|
|||
with the <<analysis-standard-analyzer>> in Lucene 5.
|
||||
|
||||
[[cjk-analyzer]]
|
||||
==== `cjk` analyzer
|
||||
===== `cjk` analyzer
|
||||
|
||||
The `cjk` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -359,7 +382,7 @@ The `cjk` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
or `stopwords_path` parameters.
|
||||
|
||||
[[czech-analyzer]]
|
||||
==== `czech` analyzer
|
||||
===== `czech` analyzer
|
||||
|
||||
The `czech` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -399,11 +422,11 @@ The `czech` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[danish-analyzer]]
|
||||
==== `danish` analyzer
|
||||
===== `danish` analyzer
|
||||
|
||||
The `danish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -443,11 +466,11 @@ The `danish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[dutch-analyzer]]
|
||||
==== `dutch` analyzer
|
||||
===== `dutch` analyzer
|
||||
|
||||
The `dutch` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -497,11 +520,11 @@ The `dutch` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[english-analyzer]]
|
||||
==== `english` analyzer
|
||||
===== `english` analyzer
|
||||
|
||||
The `english` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -546,11 +569,11 @@ The `english` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[finnish-analyzer]]
|
||||
==== `finnish` analyzer
|
||||
===== `finnish` analyzer
|
||||
|
||||
The `finnish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -590,11 +613,11 @@ The `finnish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[french-analyzer]]
|
||||
==== `french` analyzer
|
||||
===== `french` analyzer
|
||||
|
||||
The `french` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -642,11 +665,11 @@ The `french` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[galician-analyzer]]
|
||||
==== `galician` analyzer
|
||||
===== `galician` analyzer
|
||||
|
||||
The `galician` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -686,11 +709,11 @@ The `galician` analyzer could be reimplemented as a `custom` analyzer as follows
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[german-analyzer]]
|
||||
==== `german` analyzer
|
||||
===== `german` analyzer
|
||||
|
||||
The `german` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -720,7 +743,7 @@ The `german` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
"lowercase",
|
||||
"german_stop",
|
||||
"german_keywords",
|
||||
"ascii_folding", <3>
|
||||
"german_normalization",
|
||||
"german_stemmer"
|
||||
]
|
||||
}
|
||||
|
@ -731,14 +754,11 @@ The `german` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<3> The `german` analyzer actually uses the GermanNormalizationFilter,
|
||||
which isn't exposed in Elasticsearch. The `ascii_folding` filter
|
||||
does a similar job but is more extensive.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[greek-analyzer]]
|
||||
==== `greek` analyzer
|
||||
===== `greek` analyzer
|
||||
|
||||
The `greek` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -752,6 +772,10 @@ The `greek` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
"type": "stop",
|
||||
"stopwords": "_greek_" <1>
|
||||
},
|
||||
"greek_lowercase": {
|
||||
"type": "lowercase",
|
||||
"language": "greek"
|
||||
},
|
||||
"greek_keywords": {
|
||||
"type": "keyword_marker",
|
||||
"keywords": [] <2>
|
||||
|
@ -765,7 +789,7 @@ The `greek` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
"greek": {
|
||||
"tokenizer": "standard",
|
||||
"filter": [
|
||||
"lowercase",
|
||||
"greek_lowercase",
|
||||
"greek_stop",
|
||||
"greek_keywords",
|
||||
"greek_stemmer"
|
||||
|
@ -778,18 +802,57 @@ The `greek` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[hindi-analyzer]]
|
||||
==== `hindi` analyzer
|
||||
===== `hindi` analyzer
|
||||
|
||||
The `hindi` analyzer cannot currently be implemented as a `custom` analyzer
|
||||
as it depends on the IndicNormalizationFilter and HindiNormalizationFilter
|
||||
which are not yet exposed by Elasticsearch. Instead, see the <<analysis-icu-plugin>>.
|
||||
The `hindi` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
[source,js]
|
||||
----------------------------------------------------
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"filter": {
|
||||
"hindi_stop": {
|
||||
"type": "stop",
|
||||
"stopwords": "_hindi_" <1>
|
||||
},
|
||||
"hindi_keywords": {
|
||||
"type": "keyword_marker",
|
||||
"keywords": [] <2>
|
||||
},
|
||||
"hindi_stemmer": {
|
||||
"type": "stemmer",
|
||||
"language": "hindi"
|
||||
}
|
||||
},
|
||||
"analyzer": {
|
||||
"hindi": {
|
||||
"tokenizer": "standard",
|
||||
"filter": [
|
||||
"lowercase",
|
||||
"indic_normalization",
|
||||
"hindi_normalization",
|
||||
"hindi_stop",
|
||||
"hindi_keywords",
|
||||
"hindi_stemmer"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[hungarian-analyzer]]
|
||||
==== `hungarian` analyzer
|
||||
===== `hungarian` analyzer
|
||||
|
||||
The `hungarian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -829,12 +892,12 @@ The `hungarian` analyzer could be reimplemented as a `custom` analyzer as follow
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
|
||||
[[indonesian-analyzer]]
|
||||
==== `indonesian` analyzer
|
||||
===== `indonesian` analyzer
|
||||
|
||||
The `indonesian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -874,11 +937,64 @@ The `indonesian` analyzer could be reimplemented as a `custom` analyzer as follo
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[irish-analyzer]]
|
||||
===== `irish` analyzer
|
||||
|
||||
The `irish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
[source,js]
|
||||
----------------------------------------------------
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"filter": {
|
||||
"irish_elision": {
|
||||
"type": "elision",
|
||||
"articles": [ "h", "n", "t" ]
|
||||
},
|
||||
"irish_stop": {
|
||||
"type": "stop",
|
||||
"stopwords": "_irish_" <1>
|
||||
},
|
||||
"irish_lowercase": {
|
||||
"type": "lowercase",
|
||||
"language": "irish"
|
||||
},
|
||||
"irish_keywords": {
|
||||
"type": "keyword_marker",
|
||||
"keywords": [] <2>
|
||||
},
|
||||
"irish_stemmer": {
|
||||
"type": "stemmer",
|
||||
"language": "irish"
|
||||
}
|
||||
},
|
||||
"analyzer": {
|
||||
"irish": {
|
||||
"tokenizer": "standard",
|
||||
"filter": [
|
||||
"irish_stop",
|
||||
"irish_elision",
|
||||
"irish_lowercase",
|
||||
"irish_keywords",
|
||||
"irish_stemmer"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[italian-analyzer]]
|
||||
==== `italian` analyzer
|
||||
===== `italian` analyzer
|
||||
|
||||
The `italian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -928,11 +1044,11 @@ The `italian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[norwegian-analyzer]]
|
||||
==== `norwegian` analyzer
|
||||
===== `norwegian` analyzer
|
||||
|
||||
The `norwegian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -972,11 +1088,11 @@ The `norwegian` analyzer could be reimplemented as a `custom` analyzer as follow
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[persian-analyzer]]
|
||||
==== `persian` analyzer
|
||||
===== `persian` analyzer
|
||||
|
||||
The `persian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -1018,7 +1134,7 @@ The `persian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
or `stopwords_path` parameters.
|
||||
|
||||
[[portuguese-analyzer]]
|
||||
==== `portuguese` analyzer
|
||||
===== `portuguese` analyzer
|
||||
|
||||
The `portuguese` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -1058,11 +1174,11 @@ The `portuguese` analyzer could be reimplemented as a `custom` analyzer as follo
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[romanian-analyzer]]
|
||||
==== `romanian` analyzer
|
||||
===== `romanian` analyzer
|
||||
|
||||
The `romanian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -1102,12 +1218,12 @@ The `romanian` analyzer could be reimplemented as a `custom` analyzer as follows
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
|
||||
[[russian-analyzer]]
|
||||
==== `russian` analyzer
|
||||
===== `russian` analyzer
|
||||
|
||||
The `russian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -1147,11 +1263,56 @@ The `russian` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[sorani-analyzer]]
|
||||
===== `sorani` analyzer
|
||||
|
||||
The `sorani` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
[source,js]
|
||||
----------------------------------------------------
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"filter": {
|
||||
"sorani_stop": {
|
||||
"type": "stop",
|
||||
"stopwords": "_sorani_" <1>
|
||||
},
|
||||
"sorani_keywords": {
|
||||
"type": "keyword_marker",
|
||||
"keywords": [] <2>
|
||||
},
|
||||
"sorani_stemmer": {
|
||||
"type": "stemmer",
|
||||
"language": "sorani"
|
||||
}
|
||||
},
|
||||
"analyzer": {
|
||||
"sorani": {
|
||||
"tokenizer": "standard",
|
||||
"filter": [
|
||||
"sorani_normalization",
|
||||
"lowercase",
|
||||
"sorani_stop",
|
||||
"sorani_keywords",
|
||||
"sorani_stemmer"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[spanish-analyzer]]
|
||||
==== `spanish` analyzer
|
||||
===== `spanish` analyzer
|
||||
|
||||
The `spanish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -1191,11 +1352,11 @@ The `spanish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[swedish-analyzer]]
|
||||
==== `swedish` analyzer
|
||||
===== `swedish` analyzer
|
||||
|
||||
The `swedish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
|
@ -1235,20 +1396,86 @@ The `swedish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
|||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> Words can be excluded from stemming with the `stem_exclusion`
|
||||
parameter.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[turkish-analyzer]]
|
||||
==== `turkish` analyzer
|
||||
===== `turkish` analyzer
|
||||
|
||||
The `turkish` analyzer cannot currently be implemented as a `custom` analyzer
|
||||
because it depends on the TurkishLowerCaseFilter and the ApostropheFilter
|
||||
which are not exposed in Elasticsearch. Instead, see the <<analysis-icu-plugin>>.
|
||||
The `turkish` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
[source,js]
|
||||
----------------------------------------------------
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"filter": {
|
||||
"turkish_stop": {
|
||||
"type": "stop",
|
||||
"stopwords": "_turkish_" <1>
|
||||
},
|
||||
"turkish_lowercase": {
|
||||
"type": "lowercase",
|
||||
"language": "turkish"
|
||||
},
|
||||
"turkish_keywords": {
|
||||
"type": "keyword_marker",
|
||||
"keywords": [] <2>
|
||||
},
|
||||
"turkish_stemmer": {
|
||||
"type": "stemmer",
|
||||
"language": "turkish"
|
||||
}
|
||||
},
|
||||
"analyzer": {
|
||||
"turkish": {
|
||||
"tokenizer": "standard",
|
||||
"filter": [
|
||||
"apostrophe",
|
||||
"turkish_lowercase",
|
||||
"turkish_stop",
|
||||
"turkish_keywords",
|
||||
"turkish_stemmer"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[thai-analyzer]]
|
||||
==== `thai` analyzer
|
||||
===== `thai` analyzer
|
||||
|
||||
The `thai` analyzer cannot currently be implemented as a `custom` analyzer
|
||||
because it depends on the ThaiTokenizer which is not exposed in Elasticsearch.
|
||||
Instead, see the <<analysis-icu-plugin>>.
|
||||
The `thai` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
[source,js]
|
||||
----------------------------------------------------
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"filter": {
|
||||
"thai_stop": {
|
||||
"type": "stop",
|
||||
"stopwords": "_thai_" <1>
|
||||
}
|
||||
},
|
||||
"analyzer": {
|
||||
"thai": {
|
||||
"tokenizer": "thai",
|
||||
"filter": [
|
||||
"lowercase",
|
||||
"thai_stop"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------------------------------
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
|
|
|
@ -218,3 +218,29 @@ Breaks text into words according to UAX #29: Unicode Text Segmentation ((http://
|
|||
}
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
[float]
|
||||
=== ICU Normalization CharFilter
|
||||
|
||||
Normalizes characters as explained http://userguide.icu-project.org/transforms/normalization[here].
|
||||
It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings.
|
||||
Allows for the name parameter to be provided which can include the following values: `nfc`, `nfkc`, and `nfkc_cf`.
|
||||
Allows for the mode parameter to be provided which can include the following values: `compose` and `decompose`.
|
||||
Use `decompose` with `nfc` or `nfkc`, to get `nfd` or `nfkd`, respectively.
|
||||
Here is a sample of these settings:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"index" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"collation" : {
|
||||
"tokenizer" : "keyword",
|
||||
"char_filter" : ["icu_normalizer"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
|
|
@ -78,3 +78,7 @@ include::tokenfilters/cjk-bigram-tokenfilter.asciidoc[]
|
|||
include::tokenfilters/delimited-payload-tokenfilter.asciidoc[]
|
||||
|
||||
include::tokenfilters/keep-words-tokenfilter.asciidoc[]
|
||||
|
||||
include::tokenfilters/classic-tokenfilter.asciidoc[]
|
||||
|
||||
include::tokenfilters/apostrophe-tokenfilter.asciidoc[]
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
[[analysis-apostrophe-tokenfilter]]
|
||||
=== Apostrophe Token Filter
|
||||
|
||||
added[1.3.0]
|
||||
|
||||
The `apostrophe` token filter strips all characters after an apostrophe,
|
||||
including the apostrophe itself.
|
|
@ -0,0 +1,11 @@
|
|||
[[analysis-classic-tokenfilter]]
|
||||
=== Classic Token Filter
|
||||
|
||||
added[1.3.0]
|
||||
|
||||
The `classic` token filter does optional post-processing of
|
||||
terms that are generated by the <<analysis-classic-tokenizer,`classic` tokenizer>>.
|
||||
|
||||
This filter removes the english possessive from the end of words, and
|
||||
it removes dots from acronyms.
|
||||
|
|
@ -5,10 +5,10 @@ Basic support for hunspell stemming. Hunspell dictionaries will be
|
|||
picked up from a dedicated hunspell directory on the filesystem
|
||||
(defaults to `<path.conf>/hunspell`). Each dictionary is expected to
|
||||
have its own directory named after its associated locale (language).
|
||||
This dictionary directory is expected to hold both the `*.aff` and `*.dic`
|
||||
files (all of which will automatically be picked up). For example,
|
||||
assuming the default hunspell location is used, the following directory
|
||||
layout will define the `en_US` dictionary:
|
||||
This dictionary directory is expected to hold a single `*.aff` and
|
||||
one or more `*.dic` files (all of which will automatically be picked up).
|
||||
For example, assuming the default hunspell location is used, the
|
||||
following directory layout will define the `en_US` dictionary:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
|
@ -25,7 +25,7 @@ _elasticsearch.yml_.
|
|||
|
||||
Each dictionary can be configured with one setting:
|
||||
|
||||
`ignore_case`::
|
||||
`ignore_case`::
|
||||
If true, dictionary matching will be case insensitive
|
||||
(defaults to `false`)
|
||||
|
||||
|
@ -67,20 +67,20 @@ settings:
|
|||
|
||||
The hunspell token filter accepts four options:
|
||||
|
||||
`locale`::
|
||||
`locale`::
|
||||
A locale for this filter. If this is unset, the `lang` or
|
||||
`language` are used instead - so one of these has to be set.
|
||||
|
||||
`dictionary`::
|
||||
`dictionary`::
|
||||
The name of a dictionary. The path to your hunspell
|
||||
dictionaries should be configured via
|
||||
`indices.analysis.hunspell.dictionary.location` before.
|
||||
|
||||
`dedup`::
|
||||
`dedup`::
|
||||
If only unique terms should be returned, this needs to be
|
||||
set to `true`. Defaults to `true`.
|
||||
|
||||
`longest_only`::
|
||||
`longest_only`::
|
||||
If only the longest term should be returned, set this to `true`.
|
||||
Defaults to `false`: all possible stems are returned.
|
||||
|
||||
|
@ -88,6 +88,16 @@ NOTE: As opposed to the snowball stemmers (which are algorithm based)
|
|||
this is a dictionary lookup based stemmer and therefore the quality of
|
||||
the stemming is determined by the quality of the dictionary.
|
||||
|
||||
[float]
|
||||
==== Dictionary loading
|
||||
|
||||
By default, the configured (`indices.analysis.hunspell.dictionary.location`)
|
||||
or default Hunspell directory (`config/hunspell/`) is checked for dictionaries
|
||||
when the node starts up, and any dictionaries are automatically loaded.
|
||||
|
||||
Dictionary loading can be deferred until they are actually used by setting
|
||||
`indices.analysis.hunspell.dictionary.lazy` to `true` in the config file.
|
||||
|
||||
[float]
|
||||
==== References
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
A token filter of type `lowercase` that normalizes token text to lower
|
||||
case.
|
||||
|
||||
Lowercase token filter supports Greek and Turkish lowercase token
|
||||
Lowercase token filter supports Greek, Irish added[1.3.0], and Turkish lowercase token
|
||||
filters through the `language` parameter. Below is a usage example in a
|
||||
custom analyzer
|
||||
|
||||
|
|
|
@ -4,12 +4,33 @@
|
|||
There are several token filters available which try to normalize special
|
||||
characters of a certain language.
|
||||
|
||||
You can currently choose between `arabic_normalization` and
|
||||
`persian_normalization` normalization in your token filter
|
||||
configuration. For more information check the
|
||||
http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html[ArabicNormalizer]
|
||||
or the
|
||||
http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html[PersianNormalizer]
|
||||
documentation.
|
||||
[horizontal]
|
||||
Arabic::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html[`arabic_normalization`]
|
||||
|
||||
German::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html[`german_normalization`] added[1.3.0]
|
||||
|
||||
Hindi::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/hi/HindiNormalizer.html[`hindi_normalization`] added[1.3.0]
|
||||
|
||||
Indic::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/in/IndicNormalizer.html[`indic_normalization`] added[1.3.0]
|
||||
|
||||
Kurdish (Sorani)::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html[`sorani_normalization`] added[1.3.0]
|
||||
|
||||
Persian::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html[`persian_normalization`]
|
||||
|
||||
Scandinavian::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html[`scandinavian_normalization`] added[1.3.0],
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html[`scandinavian_folding`] added[1.3.0]
|
||||
|
||||
*Note:* These filters are available since `0.90.2`
|
||||
|
|
|
@ -5,7 +5,7 @@ Overrides stemming algorithms, by applying a custom mapping, then
|
|||
protecting these terms from being modified by stemmers. Must be placed
|
||||
before any stemming filters.
|
||||
|
||||
Rules are separated by "=>"
|
||||
Rules are separated by `=>`
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|
|
|
@ -32,7 +32,7 @@ available values (the preferred filters are marked in *bold*):
|
|||
[horizontal]
|
||||
Arabic::
|
||||
|
||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Far%2FArabicStemmer.html[*`arabic`*]
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicStemmer.html[*`arabic`*]
|
||||
|
||||
Armenian::
|
||||
|
||||
|
@ -44,7 +44,7 @@ http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*]
|
|||
|
||||
Brazilian Portuguese::
|
||||
|
||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fbr%2FBrazilianStemmer.html[*`brazilian`*]
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/br/BrazilianStemmer.html[*`brazilian`*]
|
||||
|
||||
Bulgarian::
|
||||
|
||||
|
@ -65,15 +65,15 @@ http://snowball.tartarus.org/algorithms/danish/stemmer.html[*`danish`*]
|
|||
Dutch::
|
||||
|
||||
http://snowball.tartarus.org/algorithms/dutch/stemmer.html[*`dutch`*],
|
||||
http://snowball.tartarus.org/algorithms/kraaij_pohlmann/stemmer.html[`dutch_kp`] coming[1.3.0,Renamed from `kp`]
|
||||
http://snowball.tartarus.org/algorithms/kraaij_pohlmann/stemmer.html[`dutch_kp`] added[1.3.0,Renamed from `kp`]
|
||||
|
||||
English::
|
||||
|
||||
http://snowball.tartarus.org/algorithms/porter/stemmer.html[*`english`*] coming[1.3.0,Returns the <<analysis-porterstem-tokenfilter,`porter_stem`>> instead of the <<analysis-snowball-tokenfilter,`english` Snowball token filter>>],
|
||||
http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`] coming[1.3.0,Returns the <<analysis-kstem-tokenfilter,`kstem` token filter>>],
|
||||
http://snowball.tartarus.org/algorithms/porter/stemmer.html[*`english`*] added[1.3.0,Returns the <<analysis-porterstem-tokenfilter,`porter_stem`>> instead of the <<analysis-snowball-tokenfilter,`english` Snowball token filter>>],
|
||||
http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`] added[1.3.0,Returns the <<analysis-kstem-tokenfilter,`kstem` token filter>>],
|
||||
http://www.medialab.tfe.umu.se/courses/mdm0506a/material/fulltext_ID%3D10049387%26PLACEBO%3DIE.pdf[`minimal_english`],
|
||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fen%2FEnglishPossessiveFilter.html[`possessive_english`],
|
||||
http://snowball.tartarus.org/algorithms/english/stemmer.html[`porter2`] coming[1.3.0,Returns the <<analysis-snowball-tokenfilter,`english` Snowball token filter>> instead of the <<analysis-snowball-tokenfilter,`porter` Snowball token filter>>],
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/en/EnglishPossessiveFilter.html[`possessive_english`],
|
||||
http://snowball.tartarus.org/algorithms/english/stemmer.html[`porter2`] added[1.3.0,Returns the <<analysis-snowball-tokenfilter,`english` Snowball token filter>> instead of the <<analysis-snowball-tokenfilter,`porter` Snowball token filter>>],
|
||||
http://snowball.tartarus.org/algorithms/lovins/stemmer.html[`lovins`]
|
||||
|
||||
Finnish::
|
||||
|
@ -87,6 +87,11 @@ http://snowball.tartarus.org/algorithms/french/stemmer.html[`french`],
|
|||
http://dl.acm.org/citation.cfm?id=1141523[*`light_french`*],
|
||||
http://dl.acm.org/citation.cfm?id=318984[`minimal_french`]
|
||||
|
||||
Galician::
|
||||
|
||||
http://bvg.udc.es/recursos_lingua/stemming.jsp[*`galician`*] added[1.3.0],
|
||||
http://bvg.udc.es/recursos_lingua/stemming.jsp[`minimal_galician`] (Plural step only) added[1.3.0]
|
||||
|
||||
German::
|
||||
|
||||
http://snowball.tartarus.org/algorithms/german/stemmer.html[`german`],
|
||||
|
@ -111,27 +116,40 @@ Indonesian::
|
|||
|
||||
http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf[*`indonesian`*]
|
||||
|
||||
Irish::
|
||||
|
||||
http://snowball.tartarus.org/otherapps/oregan/intro.html[*`irish`*]
|
||||
|
||||
Italian::
|
||||
|
||||
http://snowball.tartarus.org/algorithms/italian/stemmer.html[`italian`],
|
||||
http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[*`light_italian`*]
|
||||
|
||||
Kurdish (Sorani)::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniStemmer.html[*`sorani`*] added[1.3.0]
|
||||
|
||||
Latvian::
|
||||
|
||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Flv%2FLatvianStemmer.html[*`latvian`*]
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/lv/LatvianStemmer.html[*`latvian`*]
|
||||
|
||||
Norwegian::
|
||||
Norwegian (Bokmål)::
|
||||
|
||||
http://snowball.tartarus.org/algorithms/norwegian/stemmer.html[*`norwegian`*],
|
||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fno%2FNorwegianMinimalStemFilter.html[`minimal_norwegian`]
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/no/NorwegianLightStemmer.html[*`light_norwegian`*] added[1.3.0],
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.html[`minimal_norwegian`]
|
||||
|
||||
Norwegian (Nynorsk)::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/no/NorwegianLightStemmer.html[*`light_nynorsk`*] added[1.3.0],
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.html[`minimal_nynorsk`] added[1.3.0]
|
||||
|
||||
Portuguese::
|
||||
|
||||
http://snowball.tartarus.org/algorithms/portuguese/stemmer.html[`portuguese`],
|
||||
http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[*`light_portuguese`*],
|
||||
http://www.inf.ufrgs.br/\~buriol/papers/Orengo_CLEF07.pdf[`minimal_portuguese`],
|
||||
http://www.inf.ufrgs.br/\~viviane/rslp/index.htm[`portuguese_rslp`] coming[1.3.0]
|
||||
|
||||
http://www.inf.ufrgs.br/\~viviane/rslp/index.htm[`portuguese_rslp`] added[1.3.0]
|
||||
|
||||
Romanian::
|
||||
|
||||
|
|
|
@ -7,29 +7,72 @@ streams.
|
|||
The following are settings that can be set for a `stop` token filter
|
||||
type:
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description
|
||||
|`stopwords` |A list of stop words to use. Defaults to english stop
|
||||
words.
|
||||
[horizontal]
|
||||
`stopwords`::
|
||||
|
||||
|`stopwords_path` |A path (either relative to `config` location, or
|
||||
absolute) to a stopwords file configuration. Each stop word should be in
|
||||
its own "line" (separated by a line break). The file must be UTF-8
|
||||
encoded.
|
||||
A list of stop words to use. Defaults to `_english_` stop words.
|
||||
|
||||
|`ignore_case` |Set to `true` to lower case all words first. Defaults to
|
||||
`false`.
|
||||
`stopwords_path`::
|
||||
|
||||
|`remove_trailing` |Set to `false` in order to not ignore the last term of
|
||||
a search if it is a stop word. This is very useful for the completion
|
||||
suggester as a query like `green a` can be extended to `green apple` even
|
||||
though you remove stop words in general. Defaults to `true`.
|
||||
|=======================================================================
|
||||
A path (either relative to `config` location, or absolute) to a stopwords
|
||||
file configuration. Each stop word should be in its own "line" (separated
|
||||
by a line break). The file must be UTF-8 encoded.
|
||||
|
||||
Stopwords allow for custom language-specific expansion of default
|
||||
stopwords. It follows the `_lang_` notation and supports: arabic,
|
||||
armenian, basque, brazilian, bulgarian, catalan, czech, danish, dutch,
|
||||
english, finnish, french, galician, german, greek, hindi, hungarian,
|
||||
indonesian, italian, norwegian, persian, portuguese, romanian, russian,
|
||||
spanish, swedish, turkish.
|
||||
`ignore_case`::
|
||||
|
||||
Set to `true` to lower case all words first. Defaults to `false`.
|
||||
|
||||
`remove_trailing`::
|
||||
|
||||
Set to `false` in order to not ignore the last term of a search if it is a
|
||||
stop word. This is very useful for the completion suggester as a query
|
||||
like `green a` can be extended to `green apple` even though you remove
|
||||
stop words in general. Defaults to `true`.
|
||||
|
||||
The `stopwords` parameter accepts either an array of stopwords:
|
||||
|
||||
[source,json]
|
||||
------------------------------------
|
||||
PUT /my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"filter": {
|
||||
"my_stop": {
|
||||
"type": "stop",
|
||||
"stopwords": ["and", "is", "the"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
------------------------------------
|
||||
|
||||
or a predefined language-specific list:
|
||||
|
||||
[source,json]
|
||||
------------------------------------
|
||||
PUT /my_index
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"filter": {
|
||||
"my_stop": {
|
||||
"type": "stop",
|
||||
"stopwords": "_english_"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
------------------------------------
|
||||
|
||||
Elasticsearch provides the following predefined list of languages:
|
||||
|
||||
`_arabic_`, `_armenian_`, `_basque_`, `_brazilian_`, `_bulgarian_`,
|
||||
`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
|
||||
`_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
|
||||
`_indonesian_`, `_italian_`, `_norwegian_`, `_persian_`, `_portuguese_`,
|
||||
`_romanian_`, `_russian_`, `_spanish_`, `_swedish_`, `_turkish_`.
|
||||
|
||||
For the empty stopwords list (to disable stopwords) use: `_none_`.
|
||||
|
|
|
@ -28,3 +28,7 @@ include::tokenizers/uaxurlemail-tokenizer.asciidoc[]
|
|||
|
||||
include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/classic-tokenizer.asciidoc[]
|
||||
|
||||
include::tokenizers/thai-tokenizer.asciidoc[]
|
||||
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
[[analysis-classic-tokenizer]]
|
||||
=== Classic Tokenizer
|
||||
|
||||
added[1.3.0]
|
||||
|
||||
A tokenizer of type `classic` providing a grammar-based tokenizer that is
|
||||
a good tokenizer for English language documents. This tokenizer has
|
||||
heuristics for special treatment of acronyms, company names, email addresses,
|
||||
and internet host names. However, these rules don't always work, and
|
||||
the tokenizer doesn't work well for most languages other than English.
|
||||
|
||||
The following are settings that can be set for a `classic` tokenizer
|
||||
type:
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description
|
||||
|`max_token_length` |The maximum token length. If a token is seen that
|
||||
exceeds this length then it is discarded. Defaults to `255`.
|
||||
|=======================================================================
|
||||
|
|
@ -7,7 +7,7 @@ via a regular expression. Accepts the following settings:
|
|||
[cols="<,<",options="header",]
|
||||
|======================================================================
|
||||
|Setting |Description
|
||||
|`pattern` |The regular expression pattern, defaults to `\\W+`.
|
||||
|`pattern` |The regular expression pattern, defaults to `\W+`.
|
||||
|`flags` |The regular expression flags.
|
||||
|`group` |Which group to extract into tokens. Defaults to `-1` (split).
|
||||
|======================================================================
|
||||
|
@ -15,15 +15,24 @@ via a regular expression. Accepts the following settings:
|
|||
*IMPORTANT*: The regular expression should match the *token separators*,
|
||||
not the tokens themselves.
|
||||
|
||||
*********************************************
|
||||
Note that you may need to escape `pattern` string literal according to
|
||||
your client language rules. For example, in many programming languages
|
||||
a string literal for `\W+` pattern is written as `"\\W+"`.
|
||||
There is nothing special about `pattern` (you may have to escape other
|
||||
string literals as well); escaping `pattern` is common just because it
|
||||
often contains characters that should be escaped.
|
||||
*********************************************
|
||||
|
||||
`group` set to `-1` (the default) is equivalent to "split". Using group
|
||||
>= 0 selects the matching group as the token. For example, if you have:
|
||||
|
||||
------------------------
|
||||
pattern = \\'([^\']+)\\'
|
||||
pattern = '([^']+)'
|
||||
group = 0
|
||||
input = aaa 'bbb' 'ccc'
|
||||
------------------------
|
||||
|
||||
the output will be two tokens: 'bbb' and 'ccc' (including the ' marks).
|
||||
With the same input but using group=1, the output would be: bbb and ccc
|
||||
(no ' marks).
|
||||
the output will be two tokens: `'bbb'` and `'ccc'` (including the `'`
|
||||
marks). With the same input but using group=1, the output would be:
|
||||
`bbb` and `ccc` (no `'` marks).
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
[[analysis-thai-tokenizer]]
|
||||
=== Thai Tokenizer
|
||||
|
||||
added[1.3.0]
|
||||
|
||||
A tokenizer of type `thai` that segments Thai text into words. This tokenizer
|
||||
uses the built-in Thai segmentation algorithm included with Java to divide
|
||||
up Thai text. Text in other languages in general will be treated the same
|
||||
as `standard`.
|
|
@ -241,8 +241,19 @@ document indexed.
|
|||
[float]
|
||||
=== JSONP
|
||||
|
||||
All REST APIs accept a `callback` parameter resulting in a
|
||||
http://en.wikipedia.org/wiki/JSONP[JSONP] result.
|
||||
JSONP responses are disabled by default. coming[1.3,Previously JSONP was enabled by default]
|
||||
|
||||
When enabled, all REST APIs accept a `callback` parameter
|
||||
resulting in a http://en.wikipedia.org/wiki/JSONP[JSONP] result. You can enable
|
||||
this behavior by adding the following to `config.yaml`:
|
||||
|
||||
http.jsonp.enable: true
|
||||
|
||||
Please note, when enabled, due to the architecture of Elasticsearch, this may pose
|
||||
a security risk. Under some circumstances, an attacker may be able to exfiltrate
|
||||
data in your Elasticsearch server if they're able to force your browser to make a
|
||||
JSONP request on your behalf (e.g. by including a <script> tag on an untrusted site
|
||||
with a legitimate query against a local Elasticsearch server).
|
||||
|
||||
[float]
|
||||
=== Request body in query string
|
||||
|
|
|
@ -3,23 +3,23 @@
|
|||
|
||||
The `nodes` command shows the cluster topology.
|
||||
|
||||
[source,shell]
|
||||
["source","sh",subs="attributes,callouts"]
|
||||
--------------------------------------------------
|
||||
% curl 192.168.56.10:9200/_cat/nodes
|
||||
SP4H 4727 192.168.56.30 9300 1.0.1 1.6.0_27 72.1gb 35.4 93.9mb 79 239.1mb 0.45 3.4h d m Boneyard
|
||||
_uhJ 5134 192.168.56.10 9300 1.0.1 1.6.0_27 72.1gb 33.3 93.9mb 85 239.1mb 0.06 3.4h d * Athena
|
||||
HfDp 4562 192.168.56.20 9300 1.0.1 1.6.0_27 72.2gb 74.5 93.9mb 83 239.1mb 0.12 3.4h d m Zarek
|
||||
SP4H 4727 192.168.56.30 9300 {version} {jdk} 72.1gb 35.4 93.9mb 79 239.1mb 0.45 3.4h d m Boneyard
|
||||
_uhJ 5134 192.168.56.10 9300 {version} {jdk} 72.1gb 33.3 93.9mb 85 239.1mb 0.06 3.4h d * Athena
|
||||
HfDp 4562 192.168.56.20 9300 {version} {jdk} 72.2gb 74.5 93.9mb 83 239.1mb 0.12 3.4h d m Zarek
|
||||
--------------------------------------------------
|
||||
|
||||
The first few columns tell you where your nodes live. For sanity it
|
||||
also tells you what version of ES and the JVM each one runs.
|
||||
|
||||
[source,shell]
|
||||
["source","sh",subs="attributes,callouts"]
|
||||
--------------------------------------------------
|
||||
nodeId pid ip port version jdk
|
||||
u2PZ 4234 192.168.56.30 9300 1.0.1 1.6.0_27
|
||||
URzf 5443 192.168.56.10 9300 1.0.1 1.6.0_27
|
||||
ActN 3806 192.168.56.20 9300 1.0.1 1.6.0_27
|
||||
u2PZ 4234 192.168.56.30 9300 {version} {jdk}
|
||||
URzf 5443 192.168.56.10 9300 {version} {jdk}
|
||||
ActN 3806 192.168.56.20 9300 {version} {jdk}
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
|
@ -65,20 +65,20 @@ by default. To have the headers appear in the output, use verbose
|
|||
mode (`v`). The header name will match the supplied value (e.g.,
|
||||
`pid` versus `p`). For example:
|
||||
|
||||
[source,shell]
|
||||
["source","sh",subs="attributes,callouts"]
|
||||
--------------------------------------------------
|
||||
% curl 192.168.56.10:9200/_cat/nodes?v\&h=id,ip,port,v,m
|
||||
id ip port version m
|
||||
pLSN 192.168.56.30 9300 1.0.1 m
|
||||
k0zy 192.168.56.10 9300 1.0.1 m
|
||||
6Tyi 192.168.56.20 9300 1.0.1 *
|
||||
pLSN 192.168.56.30 9300 {version} m
|
||||
k0zy 192.168.56.10 9300 {version} m
|
||||
6Tyi 192.168.56.20 9300 {version} *
|
||||
% curl 192.168.56.10:9200/_cat/nodes?h=id,ip,port,v,m
|
||||
pLSN 192.168.56.30 9300 1.0.1 m
|
||||
k0zy 192.168.56.10 9300 1.0.1 m
|
||||
6Tyi 192.168.56.20 9300 1.0.1 *
|
||||
pLSN 192.168.56.30 9300 {version} m
|
||||
k0zy 192.168.56.10 9300 {version} m
|
||||
6Tyi 192.168.56.20 9300 {version} *
|
||||
--------------------------------------------------
|
||||
|
||||
[cols="<,<,<,<,<",options="header",]
|
||||
[cols="<,<,<,<,<",options="header",subs="normal"]
|
||||
|=======================================================================
|
||||
|Header |Alias |Appear by Default |Description |Example
|
||||
|`id` |`nodeId` |No |Unique node ID |k0zy
|
||||
|
@ -86,7 +86,7 @@ k0zy 192.168.56.10 9300 1.0.1 m
|
|||
|`host` |`h` |Yes |Host name |n1
|
||||
|`ip` |`i` |Yes |IP address |127.0.1.1
|
||||
|`port` |`po` |No |Bound transport port |9300
|
||||
|`version` |`v` |No |Elasticsearch version |1.0.1
|
||||
|`version` |`v` |No |Elasticsearch version |{version}
|
||||
|`build` |`b` |No |Elasticsearch Build hash |5c03844
|
||||
|`jdk` |`j` |No |Running Java version |1.8.0
|
||||
|`disk.avail` |`d`, `disk`, `diskAvail` |No |Available disk space |1.8gb
|
||||
|
@ -179,4 +179,8 @@ operations |9
|
|||
|`segments.count` |`sc`, `segmentsCount` |No |Number of segments |4
|
||||
|`segments.memory` |`sm`, `segmentsMemory` |No |Memory used by
|
||||
segments |1.4kb
|
||||
|`segments.index_writer_memory` |`siwm`, `segmentsIndexWriterMemory` |No
|
||||
|Memory used by index writer |1.2kb
|
||||
|`segments.version_map_memory` |`svmm`, `segmentsVersionMapMemory` |No
|
||||
|Memory used by version map |1.0kb
|
||||
|=======================================================================
|
||||
|
|
|
@ -60,12 +60,13 @@ The cluster health API accepts the following request parameters:
|
|||
`wait_for_status`::
|
||||
One of `green`, `yellow` or `red`. Will wait (until
|
||||
the timeout provided) until the status of the cluster changes to the one
|
||||
provided. By default, will not wait for any status.
|
||||
provided or better, i.e. `green` > `yellow` > `red`. By default, will not
|
||||
wait for any status.
|
||||
|
||||
`wait_for_relocating_shards`::
|
||||
A number controlling how many relocating
|
||||
shards to wait for. Usually will be `0` to indicate to wait till all
|
||||
relocation have happened. Defaults to not to wait.
|
||||
relocations have happened. Defaults to not wait.
|
||||
|
||||
`wait_for_nodes`::
|
||||
The request waits until the specified number `N` of
|
||||
|
|
|
@ -12,7 +12,7 @@ curl -XGET 'http://localhost:9200/_cluster/stats?human&pretty'
|
|||
--------------------------------------------------
|
||||
|
||||
Will return, for example:
|
||||
[source,js]
|
||||
["source","js",subs="attributes,callouts"]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"cluster_name": "elasticsearch",
|
||||
|
@ -82,7 +82,7 @@ Will return, for example:
|
|||
"client": 0
|
||||
},
|
||||
"versions": [
|
||||
"0.90.8"
|
||||
"{version}"
|
||||
],
|
||||
"os": {
|
||||
"available_processors": 4,
|
||||
|
|
|
@ -220,3 +220,13 @@ All the disable allocation settings have been deprecated in favour for
|
|||
|
||||
Logger values can also be updated by setting `logger.` prefix. More
|
||||
settings will be allowed to be updated.
|
||||
|
||||
[float]
|
||||
=== Field data circuit breaker
|
||||
|
||||
`indices.fielddata.breaker.limit`::
|
||||
See <<index-modules-fielddata>>
|
||||
|
||||
`indices.fielddata.breaker.overhead`::
|
||||
See <<index-modules-fielddata>>
|
||||
|
||||
|
|
|
@ -2,8 +2,26 @@
|
|||
== Bulk API
|
||||
|
||||
The bulk API makes it possible to perform many index/delete operations
|
||||
in a single API call. This can greatly increase the indexing speed. The
|
||||
REST API endpoint is `/_bulk`, and it expects the following JSON
|
||||
in a single API call. This can greatly increase the indexing speed.
|
||||
|
||||
.Client support for bulk requests
|
||||
*********************************************
|
||||
|
||||
Some of the officially supported clients provide helpers to assist with
|
||||
bulk requests and reindexing of documents from one index to another:
|
||||
|
||||
Perl::
|
||||
|
||||
See https://metacpan.org/pod/Search::Elasticsearch::Bulk[Search::Elasticsearch::Bulk]
|
||||
and https://metacpan.org/pod/Search::Elasticsearch::Scroll[Search::Elasticsearch::Scroll]
|
||||
|
||||
Python::
|
||||
|
||||
See http://elasticsearch-py.readthedocs.org/en/master/helpers.html[elasticsearch.helpers.*]
|
||||
|
||||
*********************************************
|
||||
|
||||
The REST API endpoint is `/_bulk`, and it expects the following JSON
|
||||
structure:
|
||||
|
||||
[source,js]
|
||||
|
@ -19,7 +37,7 @@ optional_source\n
|
|||
|
||||
*NOTE*: the final line of data must end with a newline character `\n`.
|
||||
|
||||
The possible actions are `index`, `create`, `delete` and `update`.
|
||||
The possible actions are `index`, `create`, `delete` and `update`.
|
||||
`index` and `create` expect a source on the next
|
||||
line, and have the same semantics as the `op_type` parameter to the
|
||||
standard index API (i.e. create will fail if a document with the same
|
||||
|
|
|
@ -272,7 +272,7 @@ parameter. For example:
|
|||
--------------------------------------------------
|
||||
$ curl -XPUT localhost:9200/twitter/tweet/1?timestamp=2009-11-15T14%3A12%3A12 -d '{
|
||||
"user" : "kimchy",
|
||||
"message" : "trying out Elasticsearch",
|
||||
"message" : "trying out Elasticsearch"
|
||||
}'
|
||||
--------------------------------------------------
|
||||
|
||||
|
|
|
@ -70,6 +70,45 @@ curl 'localhost:9200/test/type/_mget' -d '{
|
|||
}'
|
||||
--------------------------------------------------
|
||||
|
||||
[float]
|
||||
[[mget-type]]
|
||||
=== Optional Type
|
||||
|
||||
The mget API allows for `_type` to be optional. Set it to `_all` or leave it empty in order
|
||||
to fetch the first document matching the id across all types.
|
||||
|
||||
If you don't set the type and have many documents sharing the same `_id`, you will end up
|
||||
getting only the first matching document.
|
||||
|
||||
For example, if you have a document 1 within typeA and typeB then the following request
|
||||
will give you back only the same document twice:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl 'localhost:9200/test/_mget' -d '{
|
||||
"ids" : ["1", "1"]
|
||||
}'
|
||||
--------------------------------------------------
|
||||
|
||||
You need in that case to explicitly set the `_type`:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
GET /test/_mget/
|
||||
{
|
||||
"docs" : [
|
||||
{
|
||||
"_type":"typeA",
|
||||
"_id" : "1"
|
||||
},
|
||||
{
|
||||
"_type":"typeB",
|
||||
"_id" : "1"
|
||||
}
|
||||
]
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
[float]
|
||||
[[mget-source-filtering]]
|
||||
=== Source filtering
|
||||
|
|
|
@ -19,8 +19,9 @@ retrieved either with a parameter in the url
|
|||
curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?fields=text,...'
|
||||
--------------------------------------------------
|
||||
|
||||
or adding by adding the requested fields in the request body (see
|
||||
example below).
|
||||
or by adding the requested fields in the request body (see
|
||||
example below). Fields can also be specified with wildcards
|
||||
in similar way to the <<query-dsl-multi-match-query,multi match query>> added[1.4.0].
|
||||
|
||||
[float]
|
||||
=== Return values
|
||||
|
@ -38,9 +39,11 @@ statistics are returned for all fields but no term statistics.
|
|||
* term payloads (`payloads` : true), as base64 encoded bytes
|
||||
|
||||
If the requested information wasn't stored in the index, it will be
|
||||
omitted without further warning. See <<mapping-types,type mapping>>
|
||||
computed on the fly if possible. See <<mapping-types,type mapping>>
|
||||
for how to configure your index to store term vectors.
|
||||
|
||||
added[1.4.0,The ability to compute term vectors on the fly is only available from 1.4.0 onwards (see below)]
|
||||
|
||||
[WARNING]
|
||||
======
|
||||
Start and end offsets assume UTF-16 encoding is being used. If you want to use
|
||||
|
@ -84,7 +87,7 @@ are therefore only useful as relative measures whereas the absolute
|
|||
numbers have no meaning in this context.
|
||||
|
||||
[float]
|
||||
=== Example
|
||||
=== Example 1
|
||||
|
||||
First, we create an index that stores term vectors, payloads etc. :
|
||||
|
||||
|
@ -222,3 +225,22 @@ Response:
|
|||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
[float]
|
||||
=== Example 2 added[1.4.0]
|
||||
|
||||
Additionally, term vectors which are not explicitly stored in the index are automatically
|
||||
computed on the fly. The following request returns all information and statistics for the
|
||||
fields in document `1`, even though the terms haven't been explicitly stored in the index.
|
||||
Note that for the field `text`, the terms are not re-generated.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?pretty=true' -d '{
|
||||
"fields" : ["text", "some_field_without_term_vectors"],
|
||||
"offsets" : true,
|
||||
"positions" : true,
|
||||
"term_statistics" : true,
|
||||
"field_statistics" : true
|
||||
}'
|
||||
--------------------------------------------------
|
||||
|
|
|
@ -109,6 +109,23 @@ curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{
|
|||
If both `doc` and `script` is specified, then `doc` is ignored. Best is
|
||||
to put your field pairs of the partial document in the script itself.
|
||||
|
||||
By default if `doc` is specified then the document is always updated even
|
||||
if the merging process doesn't cause any changes. Specifying `detect_noop`
|
||||
as `true` will cause Elasticsearch to check if there are changes and, if
|
||||
there aren't, turn the update request into a noop. For example:
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{
|
||||
"doc" : {
|
||||
"name" : "new_name"
|
||||
},
|
||||
"detect_noop": true
|
||||
}'
|
||||
--------------------------------------------------
|
||||
|
||||
If `name` was `new_name` before the request was sent then the entire update
|
||||
request is ignored.
|
||||
|
||||
There is also support for `upsert`. If the document does
|
||||
not already exist, the content of the `upsert` element will be used to
|
||||
index the fresh doc:
|
||||
|
|
|
@ -66,7 +66,7 @@ Within an index/type, you can store as many documents as you want. Note that alt
|
|||
|
||||
An index can potentially store a large amount of data that can exceed the hardware limits of a single node. For example, a single index of a billion documents taking up 1TB of disk space may not fit on the disk of a single node or may be too slow to serve search requests from a single node alone.
|
||||
|
||||
To solve this problem, Elasticsearch provides the ability to subdivide your index into multiple pieces called shards. When you create an index, you can simply define the number of shards that you want. Each shard is in itself a fully-functional and independent "index" that can be hosted on any node in the cluster.
|
||||
To solve this problem, Elasticsearch provides the ability to subdivide your index into multiple pieces called shards. When you create an index, you can simply define the number of shards that you want. Each shard is in itself a fully-functional and independent "index" that can be hosted on any node in the cluster.
|
||||
|
||||
Sharding is important for two primary reasons:
|
||||
|
||||
|
@ -76,7 +76,7 @@ Sharding is important for two primary reasons:
|
|||
|
||||
The mechanics of how a shard is distributed and also how its documents are aggregated back into search requests are completely managed by Elasticsearch and is transparent to you as the user.
|
||||
|
||||
In a network/cloud environment where failures can be expected anytime, it is very useful and highly recommended to have a failover mechanism in case a shard/node somehow goes offline or disappears for whatever reason. To this end, Elasticsearch allows you to make one or more copies of your index's shards into what are called replica shards, or replicas for short.
|
||||
In a network/cloud environment where failures can be expected anytime, it is very useful and highly recommended to have a failover mechanism in case a shard/node somehow goes offline or disappears for whatever reason. To this end, Elasticsearch allows you to make one or more copies of your index's shards into what are called replica shards, or replicas for short.
|
||||
|
||||
Replication is important for two primary reasons:
|
||||
|
||||
|
@ -93,7 +93,7 @@ With that out of the way, let's get started with the fun part...
|
|||
|
||||
== Installation
|
||||
|
||||
Elasticsearch requires Java 7. Specifically as of this writing, it is recommended that you use the Oracle JDK version 1.7.0_55. Java installation varies from platform to platform so we won't go into those details here. Suffice to say, before you install Elasticsearch, please check your Java version first by running (and then install/upgrade accordingly if needed):
|
||||
Elasticsearch requires Java 7. Specifically as of this writing, it is recommended that you use the Oracle JDK version {jdk}. Java installation varies from platform to platform so we won't go into those details here. Suffice to say, before you install Elasticsearch, please check your Java version first by running (and then install/upgrade accordingly if needed):
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
|
@ -103,25 +103,25 @@ echo $JAVA_HOME
|
|||
|
||||
Once we have Java set up, we can then download and run Elasticsearch. The binaries are available from http://www.elasticsearch.org/download[`www.elasticsearch.org/download`] along with all the releases that have been made in the past. For each release, you have a choice among a zip, tar, DEB, or RPM package. For simplicity, let's use the tar package.
|
||||
|
||||
Let's download the Elasticsearch 1.1.1 tar as follows (Windows users should download the zip package):
|
||||
Let's download the Elasticsearch {version} tar as follows (Windows users should download the zip package):
|
||||
|
||||
[source,sh]
|
||||
["source","sh",subs="attributes,callouts"]
|
||||
--------------------------------------------------
|
||||
curl -L -O https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-1.1.1.tar.gz
|
||||
curl -L -O https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-{version}.tar.gz
|
||||
--------------------------------------------------
|
||||
|
||||
Then extract it as follows (Windows users should unzip the zip package):
|
||||
|
||||
[source,sh]
|
||||
["source","sh",subs="attributes,callouts"]
|
||||
--------------------------------------------------
|
||||
tar -xvf elasticsearch-1.1.1.tar.gz
|
||||
tar -xvf elasticsearch-{version}.tar.gz
|
||||
--------------------------------------------------
|
||||
|
||||
It will then create a bunch of files and folders in your current directory. We then go into the bin directory as follows:
|
||||
|
||||
[source,sh]
|
||||
["source","sh",subs="attributes,callouts"]
|
||||
--------------------------------------------------
|
||||
cd elasticsearch-1.1.1/bin
|
||||
cd elasticsearch-{version}/bin
|
||||
--------------------------------------------------
|
||||
|
||||
And now we are ready to start our node and single cluster (Windows users should run the elasticsearch.bat file):
|
||||
|
@ -133,10 +133,10 @@ And now we are ready to start our node and single cluster (Windows users should
|
|||
|
||||
If everything goes well, you should see a bunch of messages that look like below:
|
||||
|
||||
[source,sh]
|
||||
["source","sh",subs="attributes,callouts"]
|
||||
--------------------------------------------------
|
||||
./elasticsearch
|
||||
[2014-03-13 13:42:17,218][INFO ][node ] [New Goblin] version[1.1.1], pid[2085], build[5c03844/2014-02-25T15:52:53Z]
|
||||
[2014-03-13 13:42:17,218][INFO ][node ] [New Goblin] version[{version}], pid[2085], build[5c03844/2014-02-25T15:52:53Z]
|
||||
[2014-03-13 13:42:17,219][INFO ][node ] [New Goblin] initializing ...
|
||||
[2014-03-13 13:42:17,223][INFO ][plugins ] [New Goblin] loaded [], sites []
|
||||
[2014-03-13 13:42:19,831][INFO ][node ] [New Goblin] initialized
|
||||
|
@ -166,7 +166,7 @@ Also note the line marked http with information about the HTTP address (`192.168
|
|||
=== The REST API
|
||||
|
||||
Now that we have our node (and cluster) up and running, the next step is to understand how to communicate with it. Fortunately, Elasticsearch provides a very comprehensive and powerful REST API that you can use to interact with your cluster. Among the few things that can be done with the API are as follows:
|
||||
|
||||
|
||||
* Check your cluster, node, and index health, status, and statistics
|
||||
* Administer your cluster, node, and index data and metadata
|
||||
* Perform CRUD (Create, Read, Update, and Delete) and search operations against your indexes
|
||||
|
@ -174,15 +174,15 @@ Now that we have our node (and cluster) up and running, the next step is to unde
|
|||
|
||||
=== Cluster Health
|
||||
|
||||
Let's start with a basic health check, which we can use to see how our cluster is doing. We'll be using curl to do this but you can use any tool that allows you to make HTTP/REST calls. Let's assume that we are still on the same node where we started Elasticsearch on and open another command shell window.
|
||||
Let's start with a basic health check, which we can use to see how our cluster is doing. We'll be using curl to do this but you can use any tool that allows you to make HTTP/REST calls. Let's assume that we are still on the same node where we started Elasticsearch on and open another command shell window.
|
||||
|
||||
To check the cluster health, we will be using the http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/cat.html[`_cat` API]. Remember previously that our node HTTP endpoint is available at port `9200`:
|
||||
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl 'localhost:9200/_cat/health?v'
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
And the response:
|
||||
|
||||
[source,sh]
|
||||
|
@ -191,19 +191,19 @@ epoch timestamp cluster status node.total node.data shards pri relo i
|
|||
1394735289 14:28:09 elasticsearch green 1 1 0 0 0 0 0
|
||||
--------------------------------------------------
|
||||
|
||||
We can see that our cluster named "elasticsearch" is up with a green status.
|
||||
We can see that our cluster named "elasticsearch" is up with a green status.
|
||||
|
||||
Whenever we ask for the cluster health, we either get green, yellow, or red. Green means everything is good (cluster is fully functional), yellow means all data is available but some replicas are not yet allocated (cluster is fully functional), and red means some data is not available for whatever reason. Note that even if a cluster is red, it still is partially functional (i.e. it will continue to serve search requests from the available shards) but you will likely need to fix it ASAP since you have missing data.
|
||||
|
||||
|
||||
Also from the above response, we can see a total of 1 node and that we have 0 shards since we have no data in it yet. Note that since we are using the default cluster name (elasticsearch) and since Elasticsearch uses multicast network discovery by default to find other nodes, it is possible that you could accidentally start up more than one node in your network and have them all join a single cluster. In this scenario, you may see more than 1 node in the above response.
|
||||
|
||||
|
||||
We can also get a list of nodes in our cluster as follows:
|
||||
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl 'localhost:9200/_cat/nodes?v'
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
And the response:
|
||||
|
||||
[source,sh]
|
||||
|
@ -214,16 +214,16 @@ mwubuntu1 127.0.1.1 8 4 0.00 d * New Goblin
|
|||
--------------------------------------------------
|
||||
|
||||
Here, we can see our one node named "New Goblin", which is the single node that is currently in our cluster.
|
||||
|
||||
|
||||
=== List All Indexes
|
||||
|
||||
Now let's take a peek at our indexes:
|
||||
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl 'localhost:9200/_cat/indices?v'
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
And the response:
|
||||
|
||||
[source,sh]
|
||||
|
@ -237,15 +237,15 @@ Which simply means we have no indexes yet in the cluster.
|
|||
=== Create an Index
|
||||
|
||||
Now let's create an index named "customer" and then list all the indexes again:
|
||||
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -XPUT 'localhost:9200/customer?pretty'
|
||||
curl 'localhost:9200/_cat/indices?v'
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
The first command creates the index named "customer" using the PUT verb. We simply append `pretty` to the end of the call to tell it to pretty-print the JSON response (if any).
|
||||
|
||||
|
||||
And the response:
|
||||
|
||||
[source,sh]
|
||||
|
@ -261,7 +261,7 @@ yellow customer 5 1 0 0 495b 495b
|
|||
--------------------------------------------------
|
||||
|
||||
The results of the second command tell us that we now have 1 index named customer and it has 5 primary shards and 1 replica (the defaults) and it contains 0 documents in it.
|
||||
|
||||
|
||||
You might also notice that the customer index has a yellow health tagged to it. Recall from our previous discussion that yellow means that some replicas are not (yet) allocated. The reason this happens for this index is because Elasticsearch by default created one replica for this index. Since we only have one node running at the moment, that one replica cannot yet be allocated (for high availability) until a later point in time when another node joins the cluster. Once that replica gets allocated onto a second node, the health status for this index will turn to green.
|
||||
|
||||
=== Index and Query a Document
|
||||
|
@ -275,8 +275,8 @@ Our JSON document: { "name": "John Doe" }
|
|||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -XPUT 'localhost:9200/customer/external/1?pretty' -d '
|
||||
{
|
||||
"name": "John Doe"
|
||||
{
|
||||
"name": "John Doe"
|
||||
}'
|
||||
--------------------------------------------------
|
||||
|
||||
|
@ -285,8 +285,8 @@ And the response:
|
|||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -XPUT 'localhost:9200/customer/external/1?pretty' -d '
|
||||
{
|
||||
"name": "John Doe"
|
||||
{
|
||||
"name": "John Doe"
|
||||
}'
|
||||
{
|
||||
"_index" : "customer",
|
||||
|
@ -300,14 +300,14 @@ curl -XPUT 'localhost:9200/customer/external/1?pretty' -d '
|
|||
From the above, we can see that a new customer document was successfully created inside the customer index and the external type. The document also has an internal id of 1 which we specified at index time.
|
||||
|
||||
It is important to note that Elasticsearch does not require you to explicitly create an index first before you can index documents into it. In the previous example, Elasticsearch will automatically create the customer index if it didn't already exist beforehand.
|
||||
|
||||
|
||||
Let's now retrieve that document that we just indexed:
|
||||
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -XGET 'localhost:9200/customer/external/1?pretty'
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
And the response:
|
||||
|
||||
[source,sh]
|
||||
|
@ -323,17 +323,17 @@ curl -XGET 'localhost:9200/customer/external/1?pretty'
|
|||
--------------------------------------------------
|
||||
|
||||
Nothing out of the ordinary here other than a field, `found`, stating that we found a document with the requested ID 1 and another field, `_source`, which returns the full JSON document that we indexed from the previous step.
|
||||
|
||||
|
||||
=== Delete an Index
|
||||
|
||||
Now let's delete the index that we just created and then list all the indexes again:
|
||||
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -XDELETE 'localhost:9200/customer?pretty'
|
||||
curl 'localhost:9200/_cat/indices?v'
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
And the response:
|
||||
|
||||
[source,sh]
|
||||
|
@ -354,20 +354,20 @@ Before we move on, let's take a closer look again at some of the API commands th
|
|||
--------------------------------------------------
|
||||
curl -XPUT 'localhost:9200/customer'
|
||||
curl -XPUT 'localhost:9200/customer/external/1' -d '
|
||||
{
|
||||
"name": "John Doe"
|
||||
{
|
||||
"name": "John Doe"
|
||||
}'
|
||||
curl 'localhost:9200/customer/external/1'
|
||||
curl -XDELETE 'localhost:9200/customer'
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
If we study the above commands carefully, we can actually see a pattern of how we access data in Elasticsearch. That pattern can be summarized as follows:
|
||||
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -<REST Verb> <Node>:<Port>/<Index>/<Type>/<ID>
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
This REST access pattern is so pervasive throughout all the API commands that if you can simply remember it, you will have a good head start at mastering Elasticsearch.
|
||||
|
||||
== Modifying Your Data
|
||||
|
@ -382,8 +382,8 @@ We've previously seen how we can index a single document. Let's recall that comm
|
|||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -XPUT 'localhost:9200/customer/external/1?pretty' -d '
|
||||
{
|
||||
"name": "John Doe"
|
||||
{
|
||||
"name": "John Doe"
|
||||
}'
|
||||
--------------------------------------------------
|
||||
|
||||
|
@ -392,8 +392,8 @@ Again, the above will index the specified document into the customer index, exte
|
|||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -XPUT 'localhost:9200/customer/external/1?pretty' -d '
|
||||
{
|
||||
"name": "Jane Doe"
|
||||
{
|
||||
"name": "Jane Doe"
|
||||
}'
|
||||
--------------------------------------------------
|
||||
|
||||
|
@ -402,8 +402,8 @@ The above changes the name of the document with the ID of 1 from "John Doe" to "
|
|||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -XPUT 'localhost:9200/customer/external/2?pretty' -d '
|
||||
{
|
||||
"name": "Jane Doe"
|
||||
{
|
||||
"name": "Jane Doe"
|
||||
}'
|
||||
--------------------------------------------------
|
||||
|
||||
|
@ -416,8 +416,8 @@ This example shows how to index a document without an explicit ID:
|
|||
[source,sh]
|
||||
--------------------------------------------------
|
||||
curl -XPOST 'localhost:9200/customer/external?pretty' -d '
|
||||
{
|
||||
"name": "Jane Doe"
|
||||
{
|
||||
"name": "Jane Doe"
|
||||
}'
|
||||
--------------------------------------------------
|
||||
|
||||
|
|
Before Width: | Height: | Size: 3.0 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 3.2 KiB After Width: | Height: | Size: 2.3 KiB |
Before Width: | Height: | Size: 3.3 KiB After Width: | Height: | Size: 2.0 KiB |
After Width: | Height: | Size: 205 B |
After Width: | Height: | Size: 920 B |
After Width: | Height: | Size: 928 B |
After Width: | Height: | Size: 198 B |
After Width: | Height: | Size: 1013 B |
|
@ -100,7 +100,7 @@ settings API.
|
|||
[[disk]]
|
||||
=== Disk-based Shard Allocation
|
||||
|
||||
coming[1.3.0] disk based shard allocation is enabled from version 1.3.0 onward
|
||||
added[1.3.0] disk based shard allocation is enabled from version 1.3.0 onward
|
||||
|
||||
Elasticsearch can be configured to prevent shard
|
||||
allocation on nodes depending on disk usage for the node. This
|
||||
|
|
|
@ -31,25 +31,3 @@ configured in the node configuration).
|
|||
`indices.cache.filter.size` can accept either a percentage value, like
|
||||
`30%`, or an exact value, like `512mb`.
|
||||
|
||||
[float]
|
||||
[[index-filter]]
|
||||
==== Index Filter Cache
|
||||
|
||||
A filter cache that exists on the index level (on each node). Generally,
|
||||
not recommended for use since its memory usage depends on which shards
|
||||
are allocated on each node and it's hard to predict. The types are:
|
||||
`resident`, `soft` and `weak`.
|
||||
|
||||
All types support the following settings:
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description
|
||||
|`index.cache.filter.max_size` |The max size (count, not byte size) of
|
||||
the cache (per search segment in a shard). Defaults to not set (`-1`),
|
||||
which is usually fine with `soft` cache and proper cacheable filters.
|
||||
|
||||
|`index.cache.filter.expire` |A time based setting that expires filters
|
||||
after a certain time of inactivity. Defaults to `-1`. For example, can
|
||||
be set to `5m` for a 5 minute expiry.
|
||||
|=======================================================================
|
||||
|
|
|
@ -144,21 +144,21 @@ Type name: `bloom`
|
|||
[TIP]
|
||||
==================================================
|
||||
|
||||
It can sometime make sense to disable bloom filters. For instance, if you are
|
||||
logging into an index per day, and you have thousands of indices, the bloom
|
||||
filters can take up a sizable amount of memory. For most queries you are only
|
||||
interested in recent indices, so you don't mind CRUD operations on older
|
||||
indices taking slightly longer.
|
||||
As of 1.4, the bloom filters are no longer loaded at search time by
|
||||
default: they consume RAM in proportion to the number of unique terms,
|
||||
which can quickly add up for certain use cases, and separate
|
||||
performance improvements have made the performance gains with bloom
|
||||
filters very small.
|
||||
|
||||
In these cases you can disable loading of the bloom filter on a per-index
|
||||
basis by updating the index settings:
|
||||
You can enable loading of the bloom filter at search time on a
|
||||
per-index basis by updating the index settings:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
PUT /old_index/_settings?index.codec.bloom.load=false
|
||||
PUT /old_index/_settings?index.codec.bloom.load=true
|
||||
--------------------------------------------------
|
||||
|
||||
This setting, which defaults to `true`, can be updated on a live index. Note,
|
||||
This setting, which defaults to `false`, can be updated on a live index. Note,
|
||||
however, that changing the value will cause the index to be reopened, which
|
||||
will invalidate any existing caches.
|
||||
|
||||
|
|
|
@ -24,28 +24,63 @@ field data after a certain time of inactivity. Defaults to `-1`. For
|
|||
example, can be set to `5m` for a 5 minute expiry.
|
||||
|=======================================================================
|
||||
|
||||
[float]
|
||||
[[circuit-breaker]]
|
||||
=== Circuit Breaker
|
||||
|
||||
coming[1.4.0,Prior to 1.4.0 there was only a single circuit breaker for fielddata]
|
||||
|
||||
Elasticsearch contains multiple circuit breakers used to prevent operations from
|
||||
causing an OutOfMemoryError. Each breaker specifies a limit for how much memory
|
||||
it can use. Additionally, there is a parent-level breaker that specifies the
|
||||
total amount of memory that can be used across all breakers.
|
||||
|
||||
The parent-level breaker can be configured with the following setting:
|
||||
|
||||
`indices.breaker.total.limit`::
|
||||
Starting limit for overall parent breaker, defaults to 70% of JVM heap
|
||||
|
||||
All circuit breaker settings can be changed dynamically using the cluster update
|
||||
settings API.
|
||||
|
||||
[float]
|
||||
[[fielddata-circuit-breaker]]
|
||||
=== Field data circuit breaker
|
||||
==== Field data circuit breaker
|
||||
The field data circuit breaker allows Elasticsearch to estimate the amount of
|
||||
memory a field will require to be loaded into memory. It can then prevent the
|
||||
field data loading by raising an exception. By default the limit is configured
|
||||
to 60% of the maximum JVM heap. It can be configured with the following
|
||||
parameters:
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Setting |Description
|
||||
|`indices.fielddata.breaker.limit` |Maximum size of estimated field data
|
||||
to allow loading. Defaults to 60% of the maximum JVM heap.
|
||||
|`indices.fielddata.breaker.overhead` |A constant that all field data
|
||||
estimations are multiplied with to determine a final estimation. Defaults to
|
||||
1.03
|
||||
|=======================================================================
|
||||
`indices.breaker.fielddata.limit`::
|
||||
Limit for fielddata breaker, defaults to 60% of JVM heap
|
||||
|
||||
Both the `indices.fielddata.breaker.limit` and
|
||||
`indices.fielddata.breaker.overhead` can be changed dynamically using the
|
||||
cluster update settings API.
|
||||
`indices.breaker.fielddata.overhead`::
|
||||
A constant that all field data estimations are multiplied with to determine a
|
||||
final estimation. Defaults to 1.03
|
||||
|
||||
`indices.fielddata.breaker.limit`::
|
||||
deprecated[1.4.0,Replaced by `indices.breaker.fielddata.limit`]
|
||||
|
||||
`indices.fielddata.breaker.overhead`::
|
||||
deprecated[1.4.0,Replaced by `indices.breaker.fielddata.overhead`]
|
||||
|
||||
[float]
|
||||
[[request-circuit-breaker]]
|
||||
==== Request circuit breaker
|
||||
|
||||
coming[1.4.0]
|
||||
|
||||
The request circuit breaker allows Elasticsearch to prevent per-request data
|
||||
structures (for example, memory used for calculating aggregations during a
|
||||
request) from exceeding a certain amount of memory.
|
||||
|
||||
`indices.breaker.request.limit`::
|
||||
Limit for request breaker, defaults to 40% of JVM heap
|
||||
|
||||
`indices.breaker.request.overhead`::
|
||||
A constant that all request estimations are multiplied with to determine a
|
||||
final estimation. Defaults to 1
|
||||
|
||||
[float]
|
||||
[[fielddata-monitoring]]
|
||||
|
@ -73,10 +108,10 @@ data format.
|
|||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
tag: {
|
||||
type: "string",
|
||||
fielddata: {
|
||||
format: "fst"
|
||||
"tag": {
|
||||
"type": "string",
|
||||
"fielddata": {
|
||||
"format": "fst"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -173,10 +208,10 @@ It is possible to force field data to be loaded and cached eagerly through the
|
|||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
category: {
|
||||
type: "string",
|
||||
fielddata: {
|
||||
loading: "eager"
|
||||
"category": {
|
||||
"type": "string",
|
||||
"fielddata": {
|
||||
"loading": "eager"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -187,10 +222,10 @@ Global ordinals can also be eagerly loaded:
|
|||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
category: {
|
||||
type: "string",
|
||||
fielddata: {
|
||||
loading: "eager_global_ordinals"
|
||||
"category": {
|
||||
"type": "string",
|
||||
"fielddata": {
|
||||
"loading": "eager_global_ordinals"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -212,10 +247,10 @@ will return an error.
|
|||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
text: {
|
||||
type: "string",
|
||||
fielddata: {
|
||||
format: "disabled"
|
||||
"text": {
|
||||
"type": "string",
|
||||
"fielddata": {
|
||||
"format": "disabled"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -253,14 +288,14 @@ number of docs that the segment should contain with `min_segment_size`:
|
|||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
tag: {
|
||||
type: "string",
|
||||
fielddata: {
|
||||
filter: {
|
||||
frequency: {
|
||||
min: 0.001,
|
||||
max: 0.1,
|
||||
min_segment_size: 500
|
||||
"tag": {
|
||||
"type": "string",
|
||||
"fielddata": {
|
||||
"filter": {
|
||||
"frequency": {
|
||||
"min": 0.001,
|
||||
"max": 0.1,
|
||||
"min_segment_size": 500
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -280,13 +315,13 @@ expression which matches terms beginning with `#`:
|
|||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
tweet: {
|
||||
type: "string",
|
||||
analyzer: "whitespace"
|
||||
fielddata: {
|
||||
filter: {
|
||||
regex: {
|
||||
pattern: "^#.*"
|
||||
"tweet": {
|
||||
"type": "string",
|
||||
"analyzer": "whitespace"
|
||||
"fielddata": {
|
||||
"filter": {
|
||||
"regex": {
|
||||
"pattern": "^#.*"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -302,18 +337,18 @@ The `frequency` and `regex` filters can be combined:
|
|||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
tweet: {
|
||||
type: "string",
|
||||
analyzer: "whitespace"
|
||||
fielddata: {
|
||||
filter: {
|
||||
regex: {
|
||||
pattern: "^#.*",
|
||||
"tweet": {
|
||||
"type": "string",
|
||||
"analyzer": "whitespace"
|
||||
"fielddata": {
|
||||
"filter": {
|
||||
"regex": {
|
||||
"pattern": "^#.*",
|
||||
},
|
||||
frequency: {
|
||||
min: 0.001,
|
||||
max: 0.1,
|
||||
min_segment_size: 500
|
||||
"frequency": {
|
||||
"min": 0.001,
|
||||
"max": 0.1,
|
||||
"min_segment_size": 500
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -193,25 +193,14 @@ scheduler supports this setting:
|
|||
|
||||
`index.merge.scheduler.max_thread_count`::
|
||||
|
||||
The maximum number of concurrent merge threads that may run at once. Defaults
|
||||
to `1` which works best with spinning-magnets disks. If you are using
|
||||
a good solid-state disk (SSD) instead then try setting this to `3`.
|
||||
The maximum number of threads that may be merging at once. Defaults to
|
||||
`Math.max(1, Math.min(3, Runtime.getRuntime().availableProcessors() / 2))`
|
||||
which works well for a good solid-state-disk (SSD). If your index is on
|
||||
spinning platter drives instead, decrease this to 1.
|
||||
|
||||
[float]
|
||||
==== SerialMergeScheduler
|
||||
|
||||
A merge scheduler that simply does each merge sequentially using the
|
||||
calling thread (blocking the operations that triggered the merge or the
|
||||
index operation). This merge scheduler has a merge thread pool that
|
||||
explicitly schedules merges, and it makes sure that merges are serial
|
||||
within a shard, yet concurrent across multiple shards.
|
||||
|
||||
The scheduler supports the following settings:
|
||||
|
||||
`index.merge.scheduler.max_merge_at_once`::
|
||||
|
||||
The maximum number of merges a single merge run performs. This setting prevents
|
||||
executing an unlimited amount of merges in a loop until another shard has a
|
||||
chance to get a merge thread from the pool. If this limit is reached the
|
||||
merge thread returns to the pool and continues once the call to a single
|
||||
shard is executed. The default is `5`.
|
||||
This is accepted for backwards compatibility, but just uses
|
||||
ConcurrentMergeScheduler with index.merge.scheduler.max_thread_count
|
||||
set to 1 so that only 1 merge may run at a time.
|
||||
|
|
|
@ -57,8 +57,8 @@ using the index update settings API dynamically.
|
|||
File system based storage is the default storage used. There are
|
||||
different implementations or _storage types_. The best one for the
|
||||
operating environment will be automatically chosen: `mmapfs` on
|
||||
Solaris/Linux/Windows 64bit, `simplefs` on Windows 32bit, and
|
||||
`niofs` for the rest.
|
||||
Windows 64bit, `simplefs` on Windows 32bit, and `default`
|
||||
(hybrid `niofs` and `mmapfs`) for the rest.
|
||||
|
||||
This can be overridden for all indices by adding this to the
|
||||
`config/elasticsearch.yml` file:
|
||||
|
@ -72,12 +72,11 @@ It can also be set on a per-index basis at index creation time:
|
|||
|
||||
[source,json]
|
||||
---------------------------------
|
||||
curl -XPUT localhost:9200/my_index
|
||||
{
|
||||
curl -XPUT localhost:9200/my_index -d '{
|
||||
"settings": {
|
||||
"index.store.type": "niofs"
|
||||
}
|
||||
}
|
||||
}';
|
||||
---------------------------------
|
||||
|
||||
The following sections lists all the different storage types supported.
|
||||
|
@ -112,6 +111,17 @@ process equal to the size of the file being mapped. Before using this
|
|||
class, be sure your have plenty of virtual address space.
|
||||
See <<vm-max-map-count>>
|
||||
|
||||
[[default_fs]]
|
||||
[float]
|
||||
==== Hybrid MMap / NIO FS added[1.3.0]
|
||||
|
||||
The `default` type stores the shard index on the file system depending on
|
||||
the file type by mapping a file into memory (mmap) or using Java NIO. Currently
|
||||
only the Lucene term dictionary and doc values files are memory mapped to reduce
|
||||
the impact on the operating system. All other files are opened using Lucene `NIOFSDirectory`.
|
||||
Address space settings (<<vm-max-map-count>>) might also apply if your term
|
||||
dictionaries are large.
|
||||
|
||||
[float]
|
||||
[[store-memory]]
|
||||
=== Memory
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
[[elasticsearch-reference]]
|
||||
= Reference
|
||||
|
||||
:version: 1.2.0
|
||||
:branch: 1.2
|
||||
:jdk: 1.7.0_60
|
||||
|
||||
include::getting-started.asciidoc[]
|
||||
|
||||
include::setup.asciidoc[]
|
||||
|
|
|
@ -14,6 +14,30 @@ $ curl -XPOST 'http://localhost:9200/twitter/_flush'
|
|||
--------------------------------------------------
|
||||
|
||||
[float]
|
||||
[[flush-parameters]]
|
||||
=== Request Parameters
|
||||
|
||||
The flush API accepts the following request parameters:
|
||||
|
||||
[horizontal]
|
||||
`wait_if_ongoing`:: If set to `true` the flush operation will block until the
|
||||
flush can be executed if another flush operation is already executing.
|
||||
The default is `false` and will cause an exception to be thrown on
|
||||
the shard level if another flush operation is already running. coming[1.4.0]
|
||||
|
||||
`full`:: If set to `true` a new index writer is created and settings that have
|
||||
been changed related to the index writer will be refreshed. Note: if a full flush
|
||||
is required for a setting to take effect this will be part of the settings update
|
||||
process and is not required to be executed by the user.
|
||||
(This setting can be considered as internal)
|
||||
|
||||
`force`:: Whether a flush should be forced even if it is not necessarily needed, i.e.
|
||||
if no changes will be committed to the index. This is useful if transaction log IDs
|
||||
should be incremented even if no uncommitted changes are present.
|
||||
(This setting can be considered as internal)
|
||||
|
||||
[float]
|
||||
[[flush-multi-index]]
|
||||
=== Multi Index
|
||||
|
||||
The flush API can be applied to more than one index with a single call,
|
||||
|
|
|
@ -75,3 +75,4 @@ include::mapping/conf-mappings.asciidoc[]
|
|||
|
||||
include::mapping/meta.asciidoc[]
|
||||
|
||||
include::mapping/transform.asciidoc[]
|
||||
|
|
|
@ -21,6 +21,8 @@ include::fields/boost-field.asciidoc[]
|
|||
|
||||
include::fields/parent-field.asciidoc[]
|
||||
|
||||
include::fields/field-names-field.asciidoc[]
|
||||
|
||||
include::fields/routing-field.asciidoc[]
|
||||
|
||||
include::fields/index-field.asciidoc[]
|
||||
|
|
|
@ -68,3 +68,5 @@ any field the document:
|
|||
<1> The original query, now wrapped in a `function_score` query.
|
||||
<2> This function returns the value in `my_boost_field`, which is then
|
||||
multiplied by the query `_score` for each document.
|
||||
|
||||
Note, that `field_value_factor` is a 1.2.x feature.
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
[[mapping-field-names-field]]
|
||||
=== `_field_names`
|
||||
|
||||
added[1.3.0]
|
||||
|
||||
The `_field_names` field indexes the field names of a document, which can later
|
||||
be used to search for documents based on the fields that they contain typically
|
||||
using the `exists` and `missing` filters.
|
||||
|
||||
`_field_names` is indexed by default for indices that have been created after
|
||||
Elasticsearch 1.3.0.
|
|
@ -56,7 +56,7 @@ Will cause `2009-11-15T14:12:12` to be used as the timestamp value for:
|
|||
}
|
||||
--------------------------------------------------
|
||||
|
||||
Note, using `path` without explicit timestamp value provided require an
|
||||
Note, using `path` without explicit timestamp value provided requires an
|
||||
additional (though quite fast) parsing phase.
|
||||
|
||||
[float]
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
[[mapping-transform]]
|
||||
== Transform
|
||||
added[1.3.0]
|
||||
|
||||
The document can be transformed before it is indexed by registering a
|
||||
script in the `transform` element of the mapping. The result of the
|
||||
transform is indexed but the original source is stored in the `_source`
|
||||
field. Example:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"example" : {
|
||||
"transform" : {
|
||||
"script" : "if (ctx._source['title']?.startsWith('t')) ctx._source['suggest'] = ctx._source['content']",
|
||||
"params" : {
|
||||
"variable" : "not used but an example anyway"
|
||||
},
|
||||
"lang": "groovy"
|
||||
},
|
||||
"properties": {
|
||||
"title": { "type": "string" },
|
||||
"content": { "type": "string" },
|
||||
"suggest": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
It's also possible to specify multiple transforms:
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"example" : {
|
||||
"transform" : [
|
||||
{"script": "ctx._source['suggest'] = ctx._source['content']"}
|
||||
{"script": "ctx._source['foo'] = ctx._source['bar'];"}
|
||||
]
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
Because the result isn't stored in the source it can't normally be fetched by
|
||||
source filtering. It can be highlighted if it is marked as stored.
|
||||
|
||||
=== Get Transformed
|
||||
The get endpoint will retransform the source if the `_source_transform`
|
||||
parameter is set. Example:
|
||||
|
||||
[source,bash]
|
||||
--------------------------------------------------
|
||||
curl -XGET "http://localhost:9200/test/example/3?pretty&_source_transform"
|
||||
--------------------------------------------------
|
||||
|
||||
The transform is performed before any source filtering but it is mostly
|
||||
designed to make it easy to see what was passed to the index for debugging.
|
||||
|
||||
=== Immutable Transformation
|
||||
Once configured the transform script cannot be modified. This is not
|
||||
because that is technically impossible but instead because madness lies
|
||||
down that road.
|
|
@ -1,46 +1,87 @@
|
|||
[[mapping-nested-type]]
|
||||
=== Nested Type
|
||||
|
||||
Nested objects/documents allow to map certain sections in the document
|
||||
indexed as nested allowing to query them as if they are separate docs
|
||||
joining with the parent owning doc.
|
||||
|
||||
One of the problems when indexing inner objects that occur several times
|
||||
in a doc is that "cross object" search match will occur, for example:
|
||||
The `nested` type works like the <<mapping-object-type,`object` type>> except
|
||||
that an array of `objects` is flattened, while an array of `nested` objects
|
||||
allows each object to be queried independently. To explain, consider this
|
||||
document:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"obj1" : [
|
||||
"group" : "fans",
|
||||
"user" : [
|
||||
{
|
||||
"name" : "blue",
|
||||
"count" : 4
|
||||
"first" : "John",
|
||||
"last" : "Smith"
|
||||
},
|
||||
{
|
||||
"name" : "green",
|
||||
"count" : 6
|
||||
}
|
||||
"first" : "Alice",
|
||||
"last" : "White"
|
||||
},
|
||||
]
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
Searching for name set to blue and count higher than 5 will match the
|
||||
doc, because in the first element the name matches blue, and in the
|
||||
second element, count matches "higher than 5".
|
||||
If the `user` field is of type `object`, this document would be indexed
|
||||
internally something like this:
|
||||
|
||||
Nested mapping allows mapping certain inner objects (usually multi
|
||||
instance ones), for example:
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"group" : "fans",
|
||||
"user.first" : [ "alice", "john" ],
|
||||
"user.last" : [ "smith", "white" ]
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
The `first` and `last` fields are flattened, and the association between
|
||||
`alice` and `white` is lost. This document would incorrectly match a query
|
||||
for `alice AND smith`.
|
||||
|
||||
If the `user` field is of type `nested`, each object is indexed as a separate
|
||||
document, something like this:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{ <1>
|
||||
"user.first" : "alice",
|
||||
"user.last" : "white"
|
||||
}
|
||||
{ <1>
|
||||
"user.first" : "john",
|
||||
"user.last" : "smith"
|
||||
}
|
||||
{ <2>
|
||||
"group" : "fans"
|
||||
}
|
||||
--------------------------------------------------
|
||||
<1> Hidden nested documents.
|
||||
<2> Visible ``parent'' document.
|
||||
|
||||
By keeping each nested object separate, the association between the
|
||||
`user.first` and `user.last` fields is maintained. The query for `alice AND
|
||||
smith` would *not* match this document.
|
||||
|
||||
Searching on nested docs can be done using either the
|
||||
<<query-dsl-nested-query,nested query>> or
|
||||
<<query-dsl-nested-filter,nested filter>>.
|
||||
|
||||
==== Mapping
|
||||
|
||||
The mapping for `nested` fields is the same as `object` fields, except that it
|
||||
uses type `nested`:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"type1" : {
|
||||
"properties" : {
|
||||
"obj1" : {
|
||||
"users" : {
|
||||
"type" : "nested",
|
||||
"properties": {
|
||||
"name" : {"type": "string", "index": "not_analyzed"},
|
||||
"count" : {"type": "integer"}
|
||||
"first" : {"type": "string" },
|
||||
"last" : {"type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -48,26 +89,60 @@ instance ones), for example:
|
|||
}
|
||||
--------------------------------------------------
|
||||
|
||||
The above will cause all `obj1` to be indexed as a nested doc. The
|
||||
mapping is similar in nature to setting `type` to `object`, except that
|
||||
it's `nested`. Nested object fields can be defined explicitly as in the
|
||||
example above or added dynamically in the same way as for the root object.
|
||||
NOTE: changing an `object` type to `nested` type requires reindexing.
|
||||
|
||||
Note: changing an object type to nested type requires reindexing.
|
||||
You may want to index inner objects both as `nested` fields *and* as flattened
|
||||
`object` fields, eg for highlighting. This can be achieved by setting
|
||||
`include_in_parent` to `true`:
|
||||
|
||||
The `nested` object fields can also be automatically added to the
|
||||
immediate parent by setting `include_in_parent` to true, and also
|
||||
included in the root object by setting `include_in_root` to true.
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"type1" : {
|
||||
"properties" : {
|
||||
"users" : {
|
||||
"type" : "nested",
|
||||
"include_in_parent": true,
|
||||
"properties": {
|
||||
"first" : {"type": "string" },
|
||||
"last" : {"type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
Nested docs will also automatically use the root doc `_all` field.
|
||||
The result of indexing our example document would be something like this:
|
||||
|
||||
Searching on nested docs can be done using either the
|
||||
<<query-dsl-nested-query,nested query>> or
|
||||
<<query-dsl-nested-filter,nested filter>>.
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{ <1>
|
||||
"user.first" : "alice",
|
||||
"user.last" : "white"
|
||||
}
|
||||
{ <1>
|
||||
"user.first" : "john",
|
||||
"user.last" : "smith"
|
||||
}
|
||||
{ <2>
|
||||
"group" : "fans",
|
||||
"user.first" : [ "alice", "john" ],
|
||||
"user.last" : [ "smith", "white" ]
|
||||
}
|
||||
--------------------------------------------------
|
||||
<1> Hidden nested documents.
|
||||
<2> Visible ``parent'' document.
|
||||
|
||||
[float]
|
||||
==== Internal Implementation
|
||||
|
||||
Nested fields may contain other nested fields. The `include_in_parent` object
|
||||
refers to the direct parent of the field, while the `include_in_root`
|
||||
parameter refers only to the topmost ``root'' object or document.
|
||||
|
||||
Nested docs will automatically use the root doc `_all` field only.
|
||||
|
||||
.Internal Implementation
|
||||
*********************************************
|
||||
Internally, nested objects are indexed as additional documents, but,
|
||||
since they can be guaranteed to be indexed within the same "block", it
|
||||
allows for extremely fast joining with parent docs.
|
||||
|
@ -84,3 +159,4 @@ the `nested` query scope.
|
|||
|
||||
The `_source` field is always associated with the parent document and
|
||||
because of that field values via the source can be fetched for nested object.
|
||||
*********************************************
|
||||
|
|
|
@ -1,16 +1,13 @@
|
|||
[[mapping-root-object-type]]
|
||||
=== Root Object Type
|
||||
|
||||
The root object mapping is an
|
||||
<<mapping-object-type,object type mapping>> that
|
||||
maps the root object (the type itself). On top of all the different
|
||||
mappings that can be set using the
|
||||
<<mapping-object-type,object type mapping>>, it
|
||||
allows for additional, type level mapping definitions.
|
||||
The root object mapping is an <<mapping-object-type,object type mapping>> that
|
||||
maps the root object (the type itself). It supports all of the different
|
||||
mappings that can be set using the <<mapping-object-type,object type mapping>>.
|
||||
|
||||
The root object mapping allows to index a JSON document that either
|
||||
starts with the actual mapping type, or only contains its fields. For
|
||||
example, the following `tweet` JSON can be indexed:
|
||||
The root object mapping allows to index a JSON document that only contains its
|
||||
fields. For example, the following `tweet` JSON can be indexed without
|
||||
specifying the `tweet` type in the document itself:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
|
@ -19,20 +16,6 @@ example, the following `tweet` JSON can be indexed:
|
|||
}
|
||||
--------------------------------------------------
|
||||
|
||||
But, also the following JSON can be indexed:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"tweet" : {
|
||||
"message" : "This is a tweet!"
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
Out of the two, it is preferable to use the document *without* the type
|
||||
explicitly set.
|
||||
|
||||
[float]
|
||||
==== Index / Search Analyzers
|
||||
|
||||
|
|
|
@ -37,12 +37,12 @@ depending on the shard the current document resides in.
|
|||
|
||||
`_index.numDocs()`::
|
||||
|
||||
Number of documents in shard.
|
||||
|
||||
Number of documents in shard.
|
||||
|
||||
`_index.maxDoc()`::
|
||||
|
||||
Maximal document number in shard.
|
||||
|
||||
|
||||
`_index.numDeletedDocs()`::
|
||||
|
||||
Number of deleted documents in shard.
|
||||
|
@ -62,7 +62,7 @@ Field statistics can be accessed with a subscript operator like this:
|
|||
`_index['FIELD'].sumttf()`::
|
||||
|
||||
Sum of `ttf` over all terms that appear in field `FIELD` in all documents.
|
||||
|
||||
|
||||
`_index['FIELD'].sumdf()`::
|
||||
|
||||
The sum of `df` s over all terms that appear in field `FIELD` in all
|
||||
|
@ -77,7 +77,7 @@ The number of terms in a field cannot be accessed using the `_index` variable. S
|
|||
=== Term statistics:
|
||||
|
||||
Term statistics for a field can be accessed with a subscript operator like
|
||||
this: `_index['FIELD']['TERM']`. This will never return null, even if term or field does not exist.
|
||||
this: `_index['FIELD']['TERM']`. This will never return null, even if term or field does not exist.
|
||||
If you do not need the term frequency, call `_index['FIELD'].get('TERM', 0)`
|
||||
to avoid unnecessary initialization of the frequencies. The flag will only
|
||||
have an effect if you set the `index_options` to `docs` (see <<mapping-core-types, mapping documentation>>).
|
||||
|
@ -162,11 +162,11 @@ Positions can be accessed with an iterator that returns an object
|
|||
|
||||
Example: sums up all payloads for the term `foo`.
|
||||
|
||||
[source,mvel]
|
||||
[source,groovy]
|
||||
---------------------------------------------------------
|
||||
termInfo = _index['my_field'].get('foo',_PAYLOADS);
|
||||
score = 0;
|
||||
for (pos : termInfo) {
|
||||
for (pos in termInfo) {
|
||||
score = score + pos.payloadAsInt(0);
|
||||
}
|
||||
return score;
|
||||
|
@ -181,4 +181,3 @@ The `_index` variable can only be used to gather statistics for single terms. If
|
|||
https://lucene.apache.org/core/4_0_0/core/org/apache/lucene/index/Fields.html[Fields]
|
||||
instance. This object can then be used as described in https://lucene.apache.org/core/4_0_0/core/org/apache/lucene/index/Fields.html[lucene doc] to iterate over fields and then for each field iterate over each term in the field.
|
||||
The method will return null if the term vectors were not stored.
|
||||
|
||||
|
|
|
@ -75,7 +75,7 @@ configure the election to handle cases of slow or congested networks
|
|||
(higher values assure less chance of failure). Once a node joins, it
|
||||
will send a join request to the master (`discovery.zen.join_timeout`)
|
||||
with a timeout defaulting at 20 times the ping timeout.
|
||||
coming[1.3.0,Previously defaulted to 10 times the ping timeout].
|
||||
added[1.3.0,Previously defaulted to 10 times the ping timeout].
|
||||
|
||||
Nodes can be excluded from becoming a master by setting `node.master` to
|
||||
`false`. Note, once a node is a client node (`node.client` set to
|
||||
|
|
|
@ -42,14 +42,14 @@ once all `gateway.recover_after...nodes` conditions are met.
|
|||
|
||||
The `gateway.expected_nodes` allows to set how many data and master
|
||||
eligible nodes are expected to be in the cluster, and once met, the
|
||||
`recover_after_time` is ignored and recovery starts. The
|
||||
`gateway.expected_data_nodes` and `gateway.expected_master_nodes`
|
||||
`gateway.recover_after_time` is ignored and recovery starts.
|
||||
Setting `gateway.expected_nodes` also defaults `gateway.recover_after_time` to `5m` added[1.3.0, before `expected_nodes`
|
||||
required `recover_after_time` to be set]. The `gateway.expected_data_nodes` and `gateway.expected_master_nodes`
|
||||
settings are also supported. For example setting:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
gateway:
|
||||
recover_after_nodes: 1
|
||||
recover_after_time: 5m
|
||||
expected_nodes: 2
|
||||
--------------------------------------------------
|
||||
|
|
|
@ -18,9 +18,8 @@ For example:
|
|||
[source,js]
|
||||
--------------------------------------------------
|
||||
gateway:
|
||||
recover_after_nodes: 1
|
||||
recover_after_time: 5m
|
||||
expected_nodes: 2
|
||||
recover_after_nodes: 3
|
||||
expected_nodes: 5
|
||||
--------------------------------------------------
|
||||
|
||||
[float]
|
||||
|
|
|
@ -42,7 +42,10 @@ i.e. whether a browser on another origin can do requests to
|
|||
Elasticsearch. Defaults to `true`.
|
||||
|
||||
|`http.cors.allow-origin` |Which origins to allow. Defaults to `*`,
|
||||
i.e. any origin.
|
||||
i.e. any origin. If you prepend and append a `/` to the value, this will
|
||||
be treated as a regular expression, allowing you to support HTTP and HTTPS.
|
||||
For example, using `/https?:\/\/localhost(:[0-9]+)?/` would return the
|
||||
request header appropriately in both cases.
|
||||
|
||||
|`http.cors.max-age` |Browsers send a "preflight" OPTIONS-request to
|
||||
determine CORS settings. `max-age` defines how long the result should
|
||||
|
|
|
@ -191,6 +191,9 @@ You can disable that check using `plugins.check_lucene: false`.
|
|||
* https://github.com/elasticsearch/elasticsearch-cloud-azure[Azure Cloud Plugin] - Azure discovery
|
||||
* https://github.com/elasticsearch/elasticsearch-cloud-gce[Google Compute Engine Cloud Plugin] - GCE discovery
|
||||
|
||||
.Supported by the community
|
||||
* https://github.com/shikhar/eskka[eskka Discovery Plugin] (by Shikhar Bhushan)
|
||||
|
||||
[float]
|
||||
[[river]]
|
||||
==== River Plugins
|
||||
|
@ -225,6 +228,8 @@ You can disable that check using `plugins.check_lucene: false`.
|
|||
* https://github.com/plombard/SubversionRiver[Subversion River Plugin] (by Pascal Lombard)
|
||||
* https://github.com/kzwang/elasticsearch-river-dynamodb[DynamoDB River Plugin] (by Kevin Wang)
|
||||
* https://github.com/salyh/elasticsearch-river-imap[IMAP/POP3 Email River Plugin] (by Hendrik Saly)
|
||||
* https://github.com/codelibs/elasticsearch-river-web[Web River Plugin] (by CodeLibs Project)
|
||||
* https://github.com/eea/eea.elasticsearch.river.rdf[EEA ElasticSearch RDF River Plugin] (by the European Environment Agency)
|
||||
|
||||
[float]
|
||||
[[transport]]
|
||||
|
@ -298,4 +303,6 @@ You can disable that check using `plugins.check_lucene: false`.
|
|||
* https://github.com/kzwang/elasticsearch-image[Elasticsearch Image Plugin] (by Kevin Wang)
|
||||
* https://github.com/wikimedia/search-highlighter[Elasticsearch Experimental Highlighter] (by Wikimedia Foundation/Nik Everett)
|
||||
* https://github.com/salyh/elasticsearch-security-plugin[Elasticsearch Security Plugin] (by Hendrik Saly)
|
||||
* https://github.com/codelibs/elasticsearch-taste[Elasticsearch Taste Plugin] (by CodeLibs Project)
|
||||
* http://siren.solutions/siren/downloads/[Elasticsearch SIREn Plugin]: Nested data search (by SIREn Solutions)
|
||||
|
||||
|
|
|
@ -6,28 +6,32 @@ expressions. For example, scripts can be used to return "script fields"
|
|||
as part of a search request, or can be used to evaluate a custom score
|
||||
for a query and so on.
|
||||
|
||||
The scripting module uses by default http://mvel.codehaus.org/[mvel] as
|
||||
the scripting language with some extensions. mvel is used since it is
|
||||
extremely fast and very simple to use, and in most cases, simple
|
||||
expressions are needed (for example, mathematical equations).
|
||||
deprecated[1.3.0,Mvel has been deprecated and will be removed in 1.4.0]
|
||||
|
||||
added[1.3.0,Groovy scripting support]
|
||||
|
||||
The scripting module uses by default http://groovy.codehaus.org/[groovy]
|
||||
(previously http://mvel.codehaus.org/[mvel] in 1.3.x and earlier) as the
|
||||
scripting language with some extensions. Groovy is used since it is extremely
|
||||
fast and very simple to use.
|
||||
|
||||
Additional `lang` plugins are provided to allow to execute scripts in
|
||||
different languages. Currently supported plugins are `lang-javascript`
|
||||
for JavaScript, `lang-groovy` for Groovy, and `lang-python` for Python.
|
||||
for JavaScript, `lang-mvel` for Mvel, and `lang-python` for Python.
|
||||
All places where a `script` parameter can be used, a `lang` parameter
|
||||
(on the same level) can be provided to define the language of the
|
||||
script. The `lang` options are `mvel`, `js`, `groovy`, `python`, and
|
||||
`native`.
|
||||
script. The `lang` options are `groovy`, `js`, `mvel`, `python`,
|
||||
`expression` and `native`.
|
||||
|
||||
added[1.2.0, Dynamic scripting is disabled by default since version 1.2.0]
|
||||
added[1.2.0, Dynamic scripting is disabled for non-sandboxed languages by default since version 1.2.0]
|
||||
|
||||
To increase security, Elasticsearch does not allow you to specify scripts with a
|
||||
request. Instead, scripts must be placed in the `scripts` directory inside the
|
||||
configuration directory (the directory where elasticsearch.yml is). Scripts
|
||||
placed into this directory will automatically be picked up and be available to
|
||||
be used. Once a script has been placed in this directory, it can be referenced
|
||||
by name. For example, a script called `calculate-score.mvel` can be referenced
|
||||
in a request like this:
|
||||
To increase security, Elasticsearch does not allow you to specify scripts for
|
||||
non-sandboxed languages with a request. Instead, scripts must be placed in the
|
||||
`scripts` directory inside the configuration directory (the directory where
|
||||
elasticsearch.yml is). Scripts placed into this directory will automatically be
|
||||
picked up and be available to be used. Once a script has been placed in this
|
||||
directory, it can be referenced by name. For example, a script called
|
||||
`calculate-score.groovy` can be referenced in a request like this:
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
|
@ -36,13 +40,13 @@ config
|
|||
├── elasticsearch.yml
|
||||
├── logging.yml
|
||||
└── scripts
|
||||
└── calculate-score.mvel
|
||||
└── calculate-score.groovy
|
||||
--------------------------------------------------
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
$ cat config/scripts/calculate-score.mvel
|
||||
Math.log(_score * 2) + my_modifier
|
||||
$ cat config/scripts/calculate-score.groovy
|
||||
log(_score * 2) + my_modifier
|
||||
--------------------------------------------------
|
||||
|
||||
[source,js]
|
||||
|
@ -76,20 +80,92 @@ a script placed under `config/scripts/group1/group2/test.py` will be
|
|||
named `group1_group2_test`.
|
||||
|
||||
[float]
|
||||
=== Default Scripting Language
|
||||
=== Indexed Scripts
|
||||
If dynamic scripting is enabled, Elasticsearch allows you to store scripts
|
||||
in an internal index known as `.scripts` and reference them by id. There are
|
||||
REST endpoints to manage indexed scripts as follows:
|
||||
|
||||
Requests to the scripts endpoint look like :
|
||||
[source,js]
|
||||
-----------------------------------
|
||||
/_scripts/{lang}/{id}
|
||||
-----------------------------------
|
||||
Where the `lang` part is the language the script is in and the `id` part is the id
|
||||
of the script. In the `.scripts` index the type of the document will be set to the `lang`.
|
||||
|
||||
|
||||
[source,js]
|
||||
-----------------------------------
|
||||
curl -XPOST localhost:9200/_scripts/groovy/indexedCalculateScore -d '{
|
||||
"script": "log(_score * 2) + my_modifier"
|
||||
}'
|
||||
-----------------------------------
|
||||
|
||||
This will create a document with id: `indexedCalculateScore` and type: `groovy` in the
|
||||
`.scripts` index. The type of the document is the language used by the script.
|
||||
|
||||
This script can be accessed at query time by appending `_id` to
|
||||
the script parameter and passing the script id. So `script` becomes `script_id`:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XPOST localhost:9200/_search -d '{
|
||||
"query": {
|
||||
"function_score": {
|
||||
"query": {
|
||||
"match": {
|
||||
"body": "foo"
|
||||
}
|
||||
},
|
||||
"functions": [
|
||||
{
|
||||
"script_score": {
|
||||
"script_id": "indexedCalculateScore",
|
||||
"lang" : "groovy",
|
||||
"params": {
|
||||
"my_modifier": 8
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}'
|
||||
--------------------------------------------------
|
||||
Note that you must have dynamic scripting enabled to use indexed scripts
|
||||
at query time.
|
||||
|
||||
The script can be viewed by:
|
||||
[source,js]
|
||||
-----------------------------------
|
||||
curl -XGET localhost:9200/_scripts/groovy/indexedCalculateScore
|
||||
-----------------------------------
|
||||
|
||||
This is rendered as:
|
||||
|
||||
[source,js]
|
||||
-----------------------------------
|
||||
'{
|
||||
"script": "log(_score * 2) + my_modifier"
|
||||
}'
|
||||
-----------------------------------
|
||||
|
||||
Indexed scripts can be deleted by:
|
||||
[source,js]
|
||||
-----------------------------------
|
||||
curl -XDELETE localhost:9200/_scripts/groovy/indexedCalculateScore
|
||||
-----------------------------------
|
||||
|
||||
|
||||
The default scripting language (assuming no `lang` parameter is
|
||||
provided) is `mvel`. In order to change it set the `script.default_lang`
|
||||
to the appropriate language.
|
||||
|
||||
[float]
|
||||
=== Enabling dynamic scripting
|
||||
|
||||
We recommend running Elasticsearch behind an application or proxy,
|
||||
which protects Elasticsearch from the outside world. If users are
|
||||
allowed to run dynamic scripts (even in a search request), then they
|
||||
have the same access to your box as the user that Elasticsearch is
|
||||
running as. For this reason dynamic scripting is disabled by default.
|
||||
We recommend running Elasticsearch behind an application or proxy, which
|
||||
protects Elasticsearch from the outside world. If users are allowed to run
|
||||
dynamic scripts (even in a search request), then they have the same access to
|
||||
your box as the user that Elasticsearch is running as. For this reason dynamic
|
||||
scripting is allowed only for sandboxed languages by default.
|
||||
|
||||
First, you should not run Elasticsearch as the `root` user, as this would allow
|
||||
a script to access or do *anything* on your server, without limitations. Second,
|
||||
|
@ -109,6 +185,54 @@ _native_ Java scripts registered through plugins, it also allows users to run
|
|||
arbitrary scripts via the API. Instead of sending the name of the file as the
|
||||
script, the body of the script can be sent instead.
|
||||
|
||||
There are three possible configuration values for the `script.disable_dynamic`
|
||||
setting; the default value is `sandbox`:
|
||||
|
||||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Value |Description
|
||||
| `true` |all dynamic scripting is disabled, scripts must be placed in the `config/scripts` directory.
|
||||
| `false` |all dynamic scripting is enabled, scripts may be sent as strings in requests.
|
||||
| `sandbox` |scripts may be sent as strings for languages that are sandboxed.
|
||||
|=======================================================================
|
||||
|
||||
[float]
|
||||
=== Default Scripting Language
|
||||
|
||||
The default scripting language (assuming no `lang` parameter is provided) is
|
||||
`groovy`. In order to change it, set the `script.default_lang` to the
|
||||
appropriate language.
|
||||
|
||||
[float]
|
||||
=== Groovy Sandboxing
|
||||
|
||||
Elasticsearch sandboxes Groovy scripts that are compiled and executed in order
|
||||
to ensure they don't perform unwanted actions. There are a number of options
|
||||
that can be used for configuring this sandbox:
|
||||
|
||||
`script.groovy.sandbox.receiver_whitelist`::
|
||||
|
||||
Comma-separated list of string classes for objects that may have methods
|
||||
invoked.
|
||||
|
||||
`script.groovy.sandbox.package_whitelist`::
|
||||
|
||||
Comma-separated list of packages under which new objects may be constructed.
|
||||
|
||||
`script.groovy.sandbox.class_whitelist`::
|
||||
|
||||
Comma-separated list of classes that are allowed to be constructed.
|
||||
|
||||
`script.groovy.sandbox.method_blacklist`::
|
||||
|
||||
Comma-separated list of methods that are never allowed to be invoked,
|
||||
regardless of target object.
|
||||
|
||||
`script.groovy.sandbox.enabled`::
|
||||
|
||||
Flag to disable the sandbox (defaults to `true` meaning the sandbox is
|
||||
enabled).
|
||||
|
||||
[float]
|
||||
=== Automatic Script Reloading
|
||||
|
||||
|
@ -119,10 +243,11 @@ using `watcher.interval` setting, which defaults to `60s`.
|
|||
To disable script reloading completely set `script.auto_reload_enabled`
|
||||
to `false`.
|
||||
|
||||
[[native-java-scripts]]
|
||||
[float]
|
||||
=== Native (Java) Scripts
|
||||
|
||||
Even though `mvel` is pretty fast, this allows to register native Java based
|
||||
Even though `groovy` is pretty fast, this allows to register native Java based
|
||||
scripts for faster execution.
|
||||
|
||||
In order to allow for scripts, the `NativeScriptFactory` needs to be
|
||||
|
@ -142,14 +267,43 @@ the name of the script as the `script`.
|
|||
|
||||
Note, the scripts need to be in the classpath of elasticsearch. One
|
||||
simple way to do it is to create a directory under plugins (choose a
|
||||
descriptive name), and place the jar / classes files there, they will be
|
||||
descriptive name), and place the jar / classes files there. They will be
|
||||
automatically loaded.
|
||||
|
||||
[float]
|
||||
=== Lucene Expressions Scripts
|
||||
|
||||
[WARNING]
|
||||
========================
|
||||
This feature is *experimental* and subject to change in future versions.
|
||||
========================
|
||||
|
||||
Lucene's expressions module provides a mechanism to compile a
|
||||
`javascript` expression to bytecode. This allows very fast execution,
|
||||
as if you had written a `native` script. Expression scripts can be
|
||||
used in `script_score`, `script_fields`, sort scripts and numeric aggregation scripts.
|
||||
|
||||
See the link:http://lucene.apache.org/core/4_9_0/expressions/index.html?org/apache/lucene/expressions/js/package-summary.html[expressions module documentation]
|
||||
for details on what operators and functions are available.
|
||||
|
||||
Variables in `expression` scripts are available to access:
|
||||
|
||||
* Single valued document fields, e.g. `doc['myfield'].value`
|
||||
* Parameters passed into the script, e.g. `mymodifier`
|
||||
* The current document's score, `_score` (only available when used in a `script_score`)
|
||||
|
||||
There are a few limitations relative to other script languages:
|
||||
|
||||
* Only numeric fields may be accessed
|
||||
* Stored fields are not available
|
||||
* If a field is sparse (only some documents contain a value), documents missing the field will have a value of `0`
|
||||
|
||||
[float]
|
||||
=== Score
|
||||
|
||||
In all scripts that can be used in facets, allow to access the current
|
||||
doc score using `doc.score`.
|
||||
In all scripts that can be used in facets, the current
|
||||
document's score is accessible in `doc.score`. When using a `script_score`,
|
||||
the current score is available in `_score`.
|
||||
|
||||
[float]
|
||||
=== Computing scores based on terms in scripts
|
||||
|
@ -267,7 +421,7 @@ loaded for other purposes.
|
|||
|
||||
|
||||
[float]
|
||||
=== mvel Built In Functions
|
||||
=== Groovy Built In Functions
|
||||
|
||||
There are several built in functions that can be used within scripts.
|
||||
They include:
|
||||
|
@ -275,8 +429,6 @@ They include:
|
|||
[cols="<,<",options="header",]
|
||||
|=======================================================================
|
||||
|Function |Description
|
||||
|`time()` |The current time in milliseconds.
|
||||
|
||||
|`sin(a)` |Returns the trigonometric sine of an angle.
|
||||
|
||||
|`cos(a)` |Returns the trigonometric cosine of an angle.
|
||||
|
@ -362,3 +514,4 @@ integer with the value of `8`, the result is `0` even though you were
|
|||
expecting it to be `0.125`. You may need to enforce precision by
|
||||
explicitly using a double like `1.0/num` in order to get the expected
|
||||
result.
|
||||
|
||||
|
|
|
@ -132,9 +132,9 @@ Snapshotting process is executed in non-blocking fashion. All indexing and searc
|
|||
executed against the index that is being snapshotted. However, a snapshot represents the point-in-time view of the index
|
||||
at the moment when snapshot was created, so no records that were added to the index after snapshot process had started
|
||||
will be present in the snapshot. The snapshot process starts immediately for the primary shards that has been started
|
||||
and are not relocating at the moment. Before version 1.2.0 the snapshot operation fails if cluster has any relocating or
|
||||
and are not relocating at the moment. Before version 1.2.0, the snapshot operation fails if the cluster has any relocating or
|
||||
initializing primaries of indices participating in the snapshot. Starting with version 1.2.0, Elasticsearch waits for
|
||||
are relocating or initializing shards to start before snapshotting them.
|
||||
relocation or initialization of shards to complete before snapshotting them.
|
||||
|
||||
Besides creating a copy of each index the snapshot process can also store global cluster metadata, which includes persistent
|
||||
cluster settings and templates. The transient settings and registered snapshot repositories are not stored as part of
|
||||
|
@ -189,6 +189,7 @@ should be restored as well as prevent global cluster state from being restored b
|
|||
<<search-multi-index-type,multi index syntax>>. The `rename_pattern` and `rename_replacement` options can be also used to
|
||||
rename index on restore using regular expression that supports referencing the original text as explained
|
||||
http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here].
|
||||
Set `include_aliases` to `false` to prevent aliases from being restored together with associated indices added[1.3.0].
|
||||
|
||||
[source,js]
|
||||
-----------------------------------
|
||||
|
@ -207,6 +208,16 @@ didn't exist in the cluster. If cluster state is restored, the restored template
|
|||
cluster are added and existing templates with the same name are replaced by the restored templates. The restored
|
||||
persistent settings are added to the existing persistent settings.
|
||||
|
||||
[float]
|
||||
=== Partial restore
|
||||
|
||||
added[1.3.0]
|
||||
|
||||
By default, the entire restore operation will fail if one or more indices participating in the operation don't have
|
||||
snapshots of all shards available. This can occur, for example, if some shards failed to snapshot. It is still possible to
|
||||
restore such indices by setting `partial` to `true`. Please note, that only successfully snapshotted shards will be
|
||||
restored in this case and all missing shards will be recreated empty.
|
||||
|
||||
|
||||
[float]
|
||||
=== Snapshot status
|
||||
|
|
|
@ -64,7 +64,7 @@ next to the given cell.
|
|||
[float]
|
||||
==== Caching
|
||||
|
||||
coming[1.3.0]
|
||||
added[1.3.0]
|
||||
|
||||
The result of the filter is not cached by default. The
|
||||
`_cache` parameter can be set to `true` to turn caching on.
|
||||
|
|
|
@ -45,7 +45,7 @@ The `has_child` filter also accepts a filter instead of a query:
|
|||
[float]
|
||||
==== Min/Max Children
|
||||
|
||||
coming[1.3.0]
|
||||
added[1.3.0]
|
||||
|
||||
The `has_child` filter allows you to specify that a minimum and/or maximum
|
||||
number of children are required to match for the parent doc to be considered
|
||||
|
@ -75,13 +75,24 @@ is specified.
|
|||
[float]
|
||||
==== Memory Considerations
|
||||
|
||||
With the current implementation, all `_parent` field values and all `_id`
|
||||
field values of parent documents are loaded into memory (heap) via field data
|
||||
in order to support fast lookups, so make sure there is enough memory for it.
|
||||
In order to support parent-child joins, all of the (string) parent IDs
|
||||
must be resident in memory (in the <<index-modules-fielddata,field data cache>>).
|
||||
Additionaly, every child document is mapped to its parent using a long
|
||||
value (approximately). It is advisable to keep the string parent ID short
|
||||
in order to reduce memory usage.
|
||||
|
||||
You can check how much memory is being used by the ID cache using the
|
||||
<<indices-stats,indices stats>> or <<cluster-nodes-stats,nodes stats>>
|
||||
APIs, e.g.:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XGET "http://localhost:9200/_stats/id_cache?pretty&human"
|
||||
--------------------------------------------------
|
||||
|
||||
[float]
|
||||
==== Caching
|
||||
|
||||
The `has_child` filter cannot be cached in the filter cache. The `_cache`
|
||||
and `_cache_key` options are a no-op in this filter. Also any filter that
|
||||
wraps the `has_child` filter either directly or indirectly will not be cached.
|
||||
wraps the `has_child` filter either directly or indirectly will not be cached.
|
||||
|
|
|
@ -46,11 +46,22 @@ The `has_parent` filter also accepts a filter instead of a query:
|
|||
--------------------------------------------------
|
||||
|
||||
[float]
|
||||
==== Memory considerations
|
||||
==== Memory Considerations
|
||||
|
||||
With the current implementation, all `_parent` field values and all `_id`
|
||||
field values of parent documents are loaded into memory (heap) via field data
|
||||
in order to support fast lookups, so make sure there is enough memory for it.
|
||||
In order to support parent-child joins, all of the (string) parent IDs
|
||||
must be resident in memory (in the <<index-modules-fielddata,field data cache>>).
|
||||
Additionally, every child document is mapped to its parent using a long
|
||||
value (approximately). It is advisable to keep the string parent ID short
|
||||
in order to reduce memory usage.
|
||||
|
||||
You can check how much memory is being used by the ID cache using the
|
||||
<<indices-stats,indices stats>> or <<cluster-nodes-stats,nodes stats>>
|
||||
APIs, e.g.:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XGET "http://localhost:9200/_stats/id_cache?pretty&human"
|
||||
--------------------------------------------------
|
||||
|
||||
[float]
|
||||
==== Caching
|
||||
|
|
|
@ -1,54 +1,159 @@
|
|||
[[query-dsl-filtered-query]]
|
||||
=== Filtered Query
|
||||
|
||||
A query that applies a filter to the results of another query. This
|
||||
query maps to Lucene `FilteredQuery`.
|
||||
The `filtered` query is used to combine another query with any
|
||||
<<query-dsl-filters,filter>>. Filters are usually faster than queries because:
|
||||
|
||||
* they don't have to calculate the relevance `_score` for each document --
|
||||
the answer is just a boolean ``Yes, the document matches the filter'' or
|
||||
``No, the document does not match the filter''.
|
||||
* the results from most filters can be cached in memory, making subsequent
|
||||
executions faster.
|
||||
|
||||
TIP: Exclude as many documents as you can with a filter, then query just the
|
||||
documents that remain.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"filtered" : {
|
||||
"query" : {
|
||||
"term" : { "tag" : "wow" }
|
||||
},
|
||||
"filter" : {
|
||||
"range" : {
|
||||
"age" : { "from" : 10, "to" : 20 }
|
||||
}
|
||||
}
|
||||
"filtered": {
|
||||
"query": {
|
||||
"match": { "tweet": "full text search" }
|
||||
},
|
||||
"filter": {
|
||||
"range": { "created": { "gte": "now - 1d / d" }}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
The filter object can hold only filter elements, not queries. Filters
|
||||
can be much faster compared to queries since they don't perform any
|
||||
scoring, especially when they are cached.
|
||||
The `filtered` query can be used wherever a `query` is expected, for instance,
|
||||
to use the above example in search request:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XGET localhost:9200/_search -d '
|
||||
{
|
||||
"query": {
|
||||
"filtered": { <1>
|
||||
"query": {
|
||||
"match": { "tweet": "full text search" }
|
||||
},
|
||||
"filter": {
|
||||
"range": { "created": { "gte": "now - 1d / d" }}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
'
|
||||
--------------------------------------------------
|
||||
<1> The `filtered` query is passed as the value of the `query`
|
||||
parameter in the search request.
|
||||
|
||||
==== Filtering without a query
|
||||
|
||||
If a `query` is not specified, it defaults to the
|
||||
<<query-dsl-match-all-query,`match_all` query>>. This means that the
|
||||
`filtered` query can be used to wrap just a filter, so that it can be used
|
||||
wherever a query is expected.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XGET localhost:9200/_search -d '
|
||||
{
|
||||
"query": {
|
||||
"filtered": { <1>
|
||||
"filter": {
|
||||
"range": { "created": { "gte": "now - 1d / d" }}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
'
|
||||
--------------------------------------------------
|
||||
<1> No `query` has been specified, so this request applies just the filter,
|
||||
returning all documents created since yesterday.
|
||||
|
||||
==== Multiple filters
|
||||
|
||||
Multiple filters can be applied by wrapping them in a
|
||||
<<query-dsl-bool-filter,`bool` filter>>, for example:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"filtered": {
|
||||
"query": { "match": { "tweet": "full text search" }},
|
||||
"filter": {
|
||||
"bool": {
|
||||
"must": { "range": { "created": { "gte": "now - 1d / d" }}},
|
||||
"should": [
|
||||
{ "term": { "featured": true }},
|
||||
{ "term": { "starred": true }}
|
||||
],
|
||||
"must_not": { "term": { "deleted": false }}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
Similarly, multiple queries can be combined with a
|
||||
<<query-dsl-bool-query,`bool` query>>.
|
||||
|
||||
==== Filter strategy
|
||||
|
||||
The filtered query allows to configure how to intersect the filter with the query:
|
||||
You can control how the filter and query are executed with the `strategy`
|
||||
parameter:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"filtered" : {
|
||||
"query" : {
|
||||
// query definition
|
||||
},
|
||||
"filter" : {
|
||||
// filter definition
|
||||
},
|
||||
"query" : { ... },
|
||||
"filter" : { ... ],
|
||||
"strategy": "leap_frog"
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
[horizontal]
|
||||
`leap_frog_query_first`:: Look for the first document matching the query, and then alternatively advance the query and the filter to find common matches.
|
||||
`leap_frog_filter_first`:: Look for the first document matching the filter, and then alternatively advance the query and the filter to find common matches.
|
||||
`leap_frog`:: Same as `leap_frog_query_first`.
|
||||
`query_first`:: If the filter supports random access, then search for documents using the query, and then consult the filter to check whether there is a match. Otherwise fall back to `leap_frog_query_first`.
|
||||
`random_access_${threshold}`:: If the filter supports random access and if there is at least one matching document among the first `threshold` ones, then apply the filter first. Otherwise fall back to `leap_frog_query_first`. `${threshold}` must be greater than or equal to `1`.
|
||||
`random_access_always`:: Apply the filter first if it supports random access. Otherwise fall back to `leap_frog_query_first`.
|
||||
IMPORTANT: This is an _expert-level_ setting. Most users can simply ignore it.
|
||||
|
||||
The default strategy is to use `query_first` on filters that are not advanceable such as geo filters and script filters, and `random_access_100` on other filters.
|
||||
The `strategy` parameter accepts the following options:
|
||||
|
||||
[horizontal]
|
||||
`leap_frog_query_first`::
|
||||
|
||||
Look for the first document matching the query, and then alternatively
|
||||
advance the query and the filter to find common matches.
|
||||
|
||||
`leap_frog_filter_first`::
|
||||
|
||||
Look for the first document matching the filter, and then alternatively
|
||||
advance the query and the filter to find common matches.
|
||||
|
||||
`leap_frog`::
|
||||
|
||||
Same as `leap_frog_query_first`.
|
||||
|
||||
`query_first`::
|
||||
|
||||
If the filter supports random access, then search for documents using the
|
||||
query, and then consult the filter to check whether there is a match.
|
||||
Otherwise fall back to `leap_frog_query_first`.
|
||||
|
||||
`random_access_${threshold}`::
|
||||
|
||||
If the filter supports random access and if there is at least one matching
|
||||
document among the first `threshold` ones, then apply the filter first.
|
||||
Otherwise fall back to `leap_frog_query_first`. `${threshold}` must be
|
||||
greater than or equal to `1`.
|
||||
|
||||
`random_access_always`::
|
||||
|
||||
Apply the filter first if it supports random access. Otherwise fall back
|
||||
to `leap_frog_query_first`.
|
||||
|
||||
The default strategy is to use `query_first` on filters that are not
|
||||
advanceable such as geo filters and script filters, and `random_access_100` on
|
||||
other filters.
|
||||
|
|
|
@ -57,7 +57,7 @@ given filter:
|
|||
If no filter is given with a function this is equivalent to specifying
|
||||
`"match_all": {}`
|
||||
|
||||
First, each document is scored by the defined functons. The parameter
|
||||
First, each document is scored by the defined functions. The parameter
|
||||
`score_mode` specifies how the computed scores are combined:
|
||||
|
||||
[horizontal]
|
||||
|
@ -151,6 +151,9 @@ that is initialized with a `seed`.
|
|||
--------------------------------------------------
|
||||
|
||||
===== Field Value factor
|
||||
|
||||
added[1.2.0]
|
||||
|
||||
The `field_value_factor` function allows you to use a field from a document to
|
||||
influence the score. It's similar to using the `script_score` function, however,
|
||||
it avoids the overhead of scripting. If used on a multi-valued field, only the
|
||||
|
@ -270,18 +273,33 @@ Normal decay, computed as:
|
|||
+
|
||||
image:images/Gaussian.png[]
|
||||
|
||||
where image:images/sigma.png[] is computed to assure that the score takes the value `decay` at distance `scale` from `origin`+-`offset`
|
||||
|
||||
image:images/sigma_calc.png[]
|
||||
|
||||
[horizontal]
|
||||
`exp`::
|
||||
|
||||
Exponential decay, computed as:
|
||||
+
|
||||
image:images/Exponential.png[]
|
||||
|
||||
where again the parameter image:images/lambda.png[] is computed to assure that the score takes the value `decay` at distance `scale` from `origin`+-`offset`
|
||||
|
||||
image:images/lambda_calc.png[]
|
||||
|
||||
[horizontal]
|
||||
`linear`::
|
||||
|
||||
Linear decay, computed as:
|
||||
+
|
||||
image:images/Linear.png[].
|
||||
+
|
||||
|
||||
|
||||
where again the parameter `s` is computed to assure that the score takes the value `decay` at distance `scale` from `origin`+-`offset`
|
||||
|
||||
image:images/s_calc.png[]
|
||||
|
||||
In contrast to the normal and exponential decay, this function actually
|
||||
sets the score to 0 if the field value exceeds twice the user given
|
||||
scale value.
|
||||
|
|
|
@ -56,7 +56,7 @@ inside the `has_child` query:
|
|||
[float]
|
||||
==== Min/Max Children
|
||||
|
||||
coming[1.3.0]
|
||||
added[1.3.0]
|
||||
|
||||
The `has_child` query allows you to specify that a minimum and/or maximum
|
||||
number of children are required to match for the parent doc to be considered
|
||||
|
@ -86,6 +86,19 @@ the `score_mode` parameter.
|
|||
[float]
|
||||
==== Memory Considerations
|
||||
|
||||
With the current implementation, all `_parent` field values and all `_id`
|
||||
field values of parent documents are loaded into memory (heap) via field data
|
||||
in order to support fast lookups, so make sure there is enough memory for it.
|
||||
In order to support parent-child joins, all of the (string) parent IDs
|
||||
must be resident in memory (in the <<index-modules-fielddata,field data cache>>).
|
||||
Additionally, every child document is mapped to its parent using a long
|
||||
value (approximately). It is advisable to keep the string parent ID short
|
||||
in order to reduce memory usage.
|
||||
|
||||
You can check how much memory is being used by the ID cache using the
|
||||
<<indices-stats,indices stats>> or <<cluster-nodes-stats,nodes stats>>
|
||||
APIs, e.g.:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XGET "http://localhost:9200/_stats/id_cache?pretty&human"
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -52,6 +52,19 @@ matching parent document. The score type can be specified with the
|
|||
[float]
|
||||
==== Memory Considerations
|
||||
|
||||
With the current implementation, all `_parent` field values and all `_id`
|
||||
field values of parent documents are loaded into memory (heap) via field data
|
||||
in order to support fast lookups, so make sure there is enough memory for it.
|
||||
In order to support parent-child joins, all of the (string) parent IDs
|
||||
must be resident in memory (in the <<index-modules-fielddata,field data cache>>).
|
||||
Additionally, every child document is mapped to its parent using a long
|
||||
value (approximately). It is advisable to keep the string parent ID short
|
||||
in order to reduce memory usage.
|
||||
|
||||
You can check how much memory is being used by the ID cache using the
|
||||
<<indices-stats,indices stats>> or <<cluster-nodes-stats,nodes stats>>
|
||||
APIs, e.g.:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XGET "http://localhost:9200/_stats/id_cache?pretty&human"
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -98,13 +98,6 @@ The `cutoff_frequency` can either be relative to the number of documents
|
|||
in the index if in the range `[0..1)` or absolute if greater or equal to
|
||||
`1.0`.
|
||||
|
||||
Note: If the `cutoff_frequency` is used and the operator is `and`
|
||||
_stacked tokens_ (tokens that are on the same position like `synonym` filter emits)
|
||||
are not handled gracefully as they are in a pure `and` query. For instance the query
|
||||
`fast fox` is analyzed into 3 terms `[fast, quick, fox]` where `quick` is a synonym
|
||||
for `fast` on the same token positions the query might require `fast` and `quick` to
|
||||
match if the operator is `and`.
|
||||
|
||||
Here is an example showing a query composed of stopwords exclusively:
|
||||
|
||||
[source,js]
|
||||
|
|
|
@ -25,7 +25,7 @@ Fields can be specified with wildcards, eg:
|
|||
--------------------------------------------------
|
||||
{
|
||||
"multi_match" : {
|
||||
"query": "Will Smith"
|
||||
"query": "Will Smith",
|
||||
"fields": [ "title", "*_name" ] <1>
|
||||
}
|
||||
}
|
||||
|
|
|
@ -52,7 +52,7 @@ fields referenced inside the query must use the complete path (fully
|
|||
qualified).
|
||||
|
||||
The `score_mode` allows to set how inner children matching affects
|
||||
scoring of parent. It defaults to `avg`, but can be `total`, `max` and
|
||||
scoring of parent. It defaults to `avg`, but can be `sum`, `max` and
|
||||
`none`.
|
||||
|
||||
Multi level nesting is automatically supported, and detected, resulting
|
||||
|
|
|
@ -23,8 +23,10 @@ search terms, but it is possible to specify other fields in the query syntax:
|
|||
|
||||
status:active
|
||||
|
||||
* where the `title` field contains `quick` or `brown`
|
||||
* where the `title` field contains `quick` or `brown`.
|
||||
If you omit the OR operator the default operator will be used
|
||||
|
||||
title:(quick OR brown)
|
||||
title:(quick brown)
|
||||
|
||||
* where the `author` field contains the exact phrase `"john smith"`
|
||||
|
@ -133,7 +135,7 @@ curly brackets `{min TO max}`.
|
|||
|
||||
* All days in 2012:
|
||||
|
||||
date:[2012/01/01 TO 2012/12/31]
|
||||
date:[2012-01-01 TO 2012-12-31]
|
||||
|
||||
* Numbers 1..5
|
||||
|
||||
|
@ -149,7 +151,7 @@ curly brackets `{min TO max}`.
|
|||
|
||||
* Dates before 2012
|
||||
|
||||
date:{* TO 2012/01/01}
|
||||
date:{* TO 2012-01-01}
|
||||
|
||||
Curly and square brackets can be combined:
|
||||
|
||||
|
|
|
@ -3,6 +3,9 @@
|
|||
|
||||
The `regexp` query allows you to use regular expression term queries.
|
||||
See <<regexp-syntax>> for details of the supported regular expression language.
|
||||
The "term queries" in that first sentence means that Elasticsearch will apply
|
||||
the regexp to the terms produced by the tokenizer for that field, and not
|
||||
to the original text of the field.
|
||||
|
||||
*Note*: The performance of a `regexp` query heavily depends on the
|
||||
regular expression chosen. Matching everything like `.*` is very slow as
|
||||
|
@ -49,7 +52,7 @@ You can also use special flags
|
|||
|
||||
Possible flags are `ALL`, `ANYSTRING`, `AUTOMATON`, `COMPLEMENT`,
|
||||
`EMPTY`, `INTERSECTION`, `INTERVAL`, or `NONE`. Please check the
|
||||
http://lucene.apache.org/core/4_3_0/core/index.html?org%2Fapache%2Flucene%2Futil%2Fautomaton%2FRegExp.html[Lucene
|
||||
http://lucene.apache.org/core/4_9_0/core/org/apache/lucene/util/automaton/RegExp.html[Lucene
|
||||
documentation] for their meaning
|
||||
|
||||
|
||||
|
|
|
@ -95,6 +95,46 @@ which is then turned into:
|
|||
}
|
||||
------------------------------------------
|
||||
|
||||
added[1.3.0]
|
||||
|
||||
You can register a template by storing it in the elasticsearch index `.scripts` or by using the REST API. (See <<search-template>> for more details)
|
||||
In order to execute the stored template, reference it by name in the `query`
|
||||
parameter:
|
||||
|
||||
|
||||
[source,js]
|
||||
------------------------------------------
|
||||
GET /_search
|
||||
{
|
||||
"query": {
|
||||
"template": {
|
||||
"query": "templateName", <1>
|
||||
"params" : {
|
||||
"template" : "all"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
------------------------------------------
|
||||
<1> Name of the query template stored in the index.
|
||||
|
||||
[source,js]
|
||||
------------------------------------------
|
||||
GET /_search
|
||||
{
|
||||
"query": {
|
||||
"template": {
|
||||
"query": "storedTemplate", <1>
|
||||
"params" : {
|
||||
"template" : "all"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
------------------------------------------
|
||||
|
||||
|
||||
There is also a dedicated `template` endpoint, allows you to template an entire search request.
|
||||
Please see <<search-template>> for more details.
|
||||
|
||||
|
|
|
@ -66,6 +66,19 @@ same scope name that will work against the child documents. For example:
|
|||
[float]
|
||||
==== Memory Considerations
|
||||
|
||||
With the current implementation, all `_parent` field values and all `_id`
|
||||
field values of parent documents are loaded into memory (heap) via field data
|
||||
in order to support fast lookups, so make sure there is enough memory for it.
|
||||
In order to support parent-child joins, all of the (string) parent IDs
|
||||
must be resident in memory (in the <<index-modules-fielddata,field data cache>>).
|
||||
Additionally, every child document is mapped to its parent using a long
|
||||
value (approximately). It is advisable to keep the string parent ID short
|
||||
in order to reduce memory usage.
|
||||
|
||||
You can check how much memory is being used by the ID cache using the
|
||||
<<indices-stats,indices stats>> or <<cluster-nodes-stats,nodes stats>>
|
||||
APIs, e.g.:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
curl -XGET "http://localhost:9200/_stats/id_cache?pretty&human"
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -83,10 +83,12 @@ include::search/request-body.asciidoc[]
|
|||
|
||||
include::search/search-template.asciidoc[]
|
||||
|
||||
include::search/facets.asciidoc[]
|
||||
include::search/search-shards.asciidoc[]
|
||||
|
||||
include::search/aggregations.asciidoc[]
|
||||
|
||||
include::search/facets.asciidoc[]
|
||||
|
||||
include::search/suggesters.asciidoc[]
|
||||
|
||||
include::search/multi-search.asciidoc[]
|
||||
|
|
|
@ -82,7 +82,7 @@ By default, the distance unit is `km` but it can also accept: `mi` (miles), `in`
|
|||
|
||||
<1> The distances will be computed as miles
|
||||
|
||||
There are two distance calculation modes: `sloppy_arc` (the default), `arc` (most accurate) and `plane` (fastest). The `arc` calculation is the most accurate one but also the more expensive one in terms of performance. The `sloppy_arc` is faster but less accurate. The `plane` is the fastest but least accurate distance function. Consider using `plane` when your search context is "narrow" and spans smaller geographical areas (like cities or even countries). `plane` may return higher error mergins for searches across very large areas (e.g. cross continent search). The distance calculation type can be set using the `distance_type` parameter:
|
||||
There are three distance calculation modes: `sloppy_arc` (the default), `arc` (most accurate) and `plane` (fastest). The `arc` calculation is the most accurate one but also the more expensive one in terms of performance. The `sloppy_arc` is faster but less accurate. The `plane` is the fastest but least accurate distance function. Consider using `plane` when your search context is "narrow" and spans smaller geographical areas (like cities or even countries). `plane` may return higher error margins for searches across very large areas (e.g. cross continent search). The distance calculation type can be set using the `distance_type` parameter:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
|
|
|
@ -142,7 +142,7 @@ Example:
|
|||
--------------------------------------------------
|
||||
{
|
||||
"query" : {
|
||||
"filtered" : { "range" : { "price" : { "to" : "500" } } }
|
||||
"filtered" : { "filter": { "range" : { "price" : { "to" : "500" } } } }
|
||||
},
|
||||
"aggs" : {
|
||||
"prices" : {
|
||||
|
|
|
@ -34,7 +34,7 @@ The following aggregations will return the minimum price products can be purchas
|
|||
{
|
||||
"query" : {
|
||||
"match" : { "name" : "led tv" }
|
||||
}
|
||||
},
|
||||
"aggs" : {
|
||||
"resellers" : {
|
||||
"nested" : {
|
||||
|
|
|
@ -194,10 +194,7 @@ where a simple `terms` aggregation would typically show the very popular "consta
|
|||
|
||||
.How are the scores calculated?
|
||||
**********************************
|
||||
The numbers returned for scores are primarily intended for ranking different suggestions sensibly rather than something easily understood by end users.
|
||||
The scores are derived from the doc frequencies in _foreground_ and _background_ sets. The _absolute_ change in popularity (foregroundPercent - backgroundPercent) would favour
|
||||
common terms whereas the _relative_ change in popularity (foregroundPercent/ backgroundPercent) would favour rare terms.
|
||||
Rare vs common is essentially a precision vs recall balance and so the absolute and relative changes are multiplied to provide a sweet spot between precision and recall.
|
||||
The numbers returned for scores are primarily intended for ranking different suggestions sensibly rather than something easily understood by end users. The scores are derived from the doc frequencies in _foreground_ and _background_ sets. In brief, a term is considered significant if there is a noticeable difference in the frequency in which a term appears in the subset and in the background. The way the terms are ranked can be configured, see "Parameters" section.
|
||||
|
||||
**********************************
|
||||
|
||||
|
@ -282,7 +279,35 @@ However, the `size` and `shard size` settings covered in the next section provid
|
|||
|
||||
==== Parameters
|
||||
|
||||
===== JLH score
|
||||
|
||||
The scores are derived from the doc frequencies in _foreground_ and _background_ sets. The _absolute_ change in popularity (foregroundPercent - backgroundPercent) would favor common terms whereas the _relative_ change in popularity (foregroundPercent/ backgroundPercent) would favor rare terms. Rare vs common is essentially a precision vs recall balance and so the absolute and relative changes are multiplied to provide a sweet spot between precision and recall.
|
||||
|
||||
===== mutual information
|
||||
added[1.3.0]
|
||||
|
||||
Mutual information as described in "Information Retrieval", Manning et al., Chapter 13.5.1 can be used as significance score by adding the parameter
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
|
||||
"mutual_information": {
|
||||
"include_negatives": true
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
Mutual information does not differentiate between terms that are descriptive for the subset or for documents outside the subset. The significant terms therefore can contain terms that appear more or less frequent in the subset than outside the subset. To filter out the terms that appear less often in the subset than in documents outside the subset, `include_negatives` can be set to `false`.
|
||||
|
||||
Per default, the assumption is that the documents in the bucket are also contained in the background. If instead you defined a custom background filter that represents a different set of documents that you want to compare to, set
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
|
||||
"background_is_superset": false
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
|
||||
===== Size & Shard Size
|
||||
|
||||
The `size` parameter can be set to define how many term buckets should be returned out of the overall terms list. By
|
||||
|
@ -338,7 +363,7 @@ Terms that score highly will be collected on a shard level and merged with the t
|
|||
|
||||
added[1.2.0] `shard_min_doc_count` parameter
|
||||
|
||||
The parameter `shard_min_doc_count` regulates the _certainty_ a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`. Terms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`. If your dictionary contains many low frequent words and you are not interested in these (for example misspellings), then you can set the `shard_min_doc_count` parameter to filter out candidate terms on a shard level that will with a resonable certainty not reach the required `min_doc_count` even after merging the local frequencies. `shard_min_doc_count` is set to `1` per default and has no effect unless you explicitly set it.
|
||||
The parameter `shard_min_doc_count` regulates the _certainty_ a shard has if the term should actually be added to the candidate list or not with respect to the `min_doc_count`. Terms will only be considered if their local shard frequency within the set is higher than the `shard_min_doc_count`. If your dictionary contains many low frequent words and you are not interested in these (for example misspellings), then you can set the `shard_min_doc_count` parameter to filter out candidate terms on a shard level that will with a reasonable certainty not reach the required `min_doc_count` even after merging the local frequencies. `shard_min_doc_count` is set to `1` per default and has no effect unless you explicitly set it.
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ Response:
|
|||
By default, the `terms` aggregation will return the buckets for the top ten terms ordered by the `doc_count`. One can
|
||||
change this default behaviour by setting the `size` parameter.
|
||||
|
||||
==== Size & Shard Size
|
||||
==== Size
|
||||
|
||||
The `size` parameter can be set to define how many term buckets should be returned out of the overall terms list. By
|
||||
default, the node coordinating the search process will request each shard to provide its own top `size` term buckets
|
||||
|
@ -52,6 +52,87 @@ This means that if the number of unique terms is greater than `size`, the return
|
|||
(it could be that the term counts are slightly off and it could even be that a term that should have been in the top
|
||||
size buckets was not returned). If set to `0`, the `size` will be set to `Integer.MAX_VALUE`.
|
||||
|
||||
==== Document counts are approximate
|
||||
|
||||
As described above, the document counts (and the results of any sub aggregations) in the terms aggregation are not always
|
||||
accurate. This is because each shard provides its own view of what the ordered list of terms should be and these are
|
||||
combined to give a final view. Consider the following scenario:
|
||||
|
||||
A request is made to obtain the top 5 terms in the field product, ordered by descending document count from an index with
|
||||
3 shards. In this case each shard is asked to give its top 5 terms.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"aggs" : {
|
||||
"products" : {
|
||||
"terms" : {
|
||||
"field" : "product",
|
||||
"size" : 5
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
The terms for each of the three shards are shown below with their
|
||||
respective document counts in brackets:
|
||||
|
||||
[width="100%",cols="^2,^2,^2,^2",options="header"]
|
||||
|=========================================================
|
||||
| | Shard A | Shard B | Shard C
|
||||
|
||||
| 1 | Product A (25) | Product A (30) | Product A (45)
|
||||
| 2 | Product B (18) | Product B (25) | Product C (44)
|
||||
| 3 | Product C (6) | Product F (17) | Product Z (36)
|
||||
| 4 | Product D (3) | Product Z (16) | Product G (30)
|
||||
| 5 | Product E (2) | Product G (15) | Product E (29)
|
||||
| 6 | Product F (2) | Product H (14) | Product H (28)
|
||||
| 7 | Product G (2) | Product I (10) | Product Q (2)
|
||||
| 8 | Product H (2) | Product Q (6) | Product D (1)
|
||||
| 9 | Product I (1) | Product J (8) |
|
||||
| 10 | Product J (1) | Product C (4) |
|
||||
|
||||
|=========================================================
|
||||
|
||||
The shards will return their top 5 terms so the results from the shards will be:
|
||||
|
||||
|
||||
[width="100%",cols="^2,^2,^2,^2",options="header"]
|
||||
|=========================================================
|
||||
| | Shard A | Shard B | Shard C
|
||||
|
||||
| 1 | Product A (25) | Product A (30) | Product A (45)
|
||||
| 2 | Product B (18) | Product B (25) | Product C (44)
|
||||
| 3 | Product C (6) | Product F (17) | Product Z (36)
|
||||
| 4 | Product D (3) | Product Z (16) | Product G (30)
|
||||
| 5 | Product E (2) | Product G (15) | Product E (29)
|
||||
|
||||
|=========================================================
|
||||
|
||||
Taking the top 5 results from each of the shards (as requested) and combining them to make a final top 5 list produces
|
||||
the following:
|
||||
|
||||
[width="40%",cols="^2,^2"]
|
||||
|=========================================================
|
||||
|
||||
| 1 | Product A (100)
|
||||
| 2 | Product Z (52)
|
||||
| 3 | Product C (50)
|
||||
| 4 | Product G (45)
|
||||
| 5 | Product B (43)
|
||||
|
||||
|=========================================================
|
||||
|
||||
Because Product A was returned from all shards we know that its document count value is accurate. Product C was only
|
||||
returned by shards A and C so its document count is shown as 50 but this is not an accurate count. Product C exists on
|
||||
shard B, but its count of 4 was not high enough to put Product C into the top 5 list for that shard. Product Z was also
|
||||
returned only by 2 shards but the third shard does not contain the term. There is no way of knowing, at the point of
|
||||
combining the results to produce the final list of terms, that there is an error in the document count for Product C and
|
||||
not for Product Z. Product H has a document count of 44 across all 3 shards but was not included in the final list of
|
||||
terms because it did not make it into the top five terms on any of the shards.
|
||||
|
||||
==== Shard Size
|
||||
|
||||
The higher the requested `size` is, the more accurate the results will be, but also, the more expensive it will be to
|
||||
compute the final results (both due to bigger priority queues that are managed on a shard level and due to bigger data
|
||||
|
@ -70,6 +151,81 @@ NOTE: `shard_size` cannot be smaller than `size` (as it doesn't make much sens
|
|||
added[1.1.0] It is possible to not limit the number of terms that are returned by setting `size` to `0`. Don't use this
|
||||
on high-cardinality fields as this will kill both your CPU since terms need to be returned sorted, and your network.
|
||||
|
||||
==== Calculating Document Count Error
|
||||
|
||||
coming[1.4.0]
|
||||
|
||||
There are two error values which can be shown on the terms aggregation. The first gives a value for the aggregation as
|
||||
a whole which represents the maximum potential document count for a term which did not make it into the final list of
|
||||
terms. This is calculated as the sum of the document count from the last term returned from each shard. For the example
|
||||
given above the value would be 46 (2 + 15 + 29). This means that in the worst case scenario a term which was not returned
|
||||
could have the 4th highest document count.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
...
|
||||
|
||||
"aggregations" : {
|
||||
"products" : {
|
||||
"doc_count_error_upper_bound" : 46,
|
||||
"buckets" : [
|
||||
{
|
||||
"key" : "Product A",
|
||||
"doc_count" : 100
|
||||
},
|
||||
{
|
||||
"key" : "Product Z",
|
||||
"doc_count" : 52
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
The second error value can be enabled by setting the `show_term_doc_count_error` parameter to true. This shows an error value
|
||||
for each term returned by the aggregation which represents the 'worst case' error in the document count and can be useful when
|
||||
deciding on a value for the `shard_size` parameter. This is calculated by summing the document counts for the last term returned
|
||||
by all shards which did not return the term. In the example above the error in the document count for Product C would be 15 as
|
||||
Shard B was the only shard not to return the term and the document count of the last term it did return was 15. The actual document
|
||||
count of Product C was 54 so the document count was only actually off by 4 even though the worst case was that it would be off by
|
||||
15. Product A, however, has an error of 0 for its document count; since every shard returned it, we can be confident that the count
|
||||
returned is accurate.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
...
|
||||
|
||||
"aggregations" : {
|
||||
"products" : {
|
||||
"doc_count_error_upper_bound" : 46,
|
||||
"buckets" : [
|
||||
{
|
||||
"key" : "Product A",
|
||||
"doc_count" : 100,
|
||||
"doc_count_error_upper_bound" : 0
|
||||
},
|
||||
{
|
||||
"key" : "Product Z",
|
||||
"doc_count" : 52,
|
||||
"doc_count_error_upper_bound" : 2
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
These errors can only be calculated in this way when the terms are ordered by descending document count. When the aggregation is
|
||||
ordered by the terms values themselves (either ascending or descending) there is no error in the document count since if a shard
|
||||
does not return a particular term which appears in the results from another shard, it must not have that term in its index. When the
|
||||
aggregation is either sorted by a sub aggregation or in order of ascending document count, the error in the document counts cannot be
|
||||
determined and is given a value of -1 to indicate this.
|
||||
|
||||
==== Order
|
||||
|
||||
The order of the buckets can be customized by setting the `order` parameter. By default, the buckets are ordered by
|
||||
|
@ -322,7 +478,7 @@ http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES
|
|||
|
||||
==== Collect mode
|
||||
|
||||
coming[1.3.0] Deferring calculation of child aggregations
|
||||
added[1.3.0] Deferring calculation of child aggregations
|
||||
|
||||
For fields with many unique terms and a small number of required results it can be more efficient to delay the calculation
|
||||
of child aggregations until the top parent-level aggs have been pruned. Ordinarily, all branches of the aggregation tree
|
||||
|
@ -395,15 +551,32 @@ this would typically be too costly in terms of RAM.
|
|||
|
||||
==== Execution hint
|
||||
|
||||
added[1.2.0] The `global_ordinals` execution mode
|
||||
added[1.2.0] Added the `global_ordinals`, `global_ordinals_hash` and `global_ordinals_low_cardinality` execution modes
|
||||
|
||||
There are three mechanisms by which terms aggregations can be executed: either by using field values directly in order to aggregate
|
||||
data per-bucket (`map`), by using ordinals of the field values instead of the values themselves (`ordinals`) or by using global
|
||||
ordinals of the field (`global_ordinals`). The latter is faster, especially for fields with many unique
|
||||
values. However it can be slower if only a few documents match, when for example a terms aggregator is nested in another
|
||||
aggregator, this applies for both `ordinals` and `global_ordinals` execution modes. Elasticsearch tries to have sensible
|
||||
defaults when it comes to the execution mode that should be used, but in case you know that one execution mode may
|
||||
perform better than the other one, you have the ability to "hint" it to Elasticsearch:
|
||||
deprecated[1.3.0] Removed the `ordinals` execution mode
|
||||
|
||||
There are different mechanisms by which terms aggregations can be executed:
|
||||
|
||||
- by using field values directly in order to aggregate data per-bucket (`map`)
|
||||
- by using ordinals of the field and preemptively allocating one bucket per ordinal value (`global_ordinals`)
|
||||
- by using ordinals of the field and dynamically allocating one bucket per ordinal value (`global_ordinals_hash`)
|
||||
- by using per-segment ordinals to compute counts and remap these counts to global counts using global ordinals (`global_ordinals_low_cardinality`)
|
||||
|
||||
Elasticsearch tries to have sensible defaults so this is something that generally doesn't need to be configured.
|
||||
|
||||
`map` should only be considered when very few documents match a query. Otherwise the ordinals-based execution modes
|
||||
are significantly faster. By default, `map` is only used when running an aggregation on scripts, since they don't have
|
||||
ordinals.
|
||||
|
||||
`global_ordinals_low_cardinality` only works for leaf terms aggregations but is usually the fastest execution mode. Memory
|
||||
usage is linear with the number of unique values in the field, so it is only enabled by default on low-cardinality fields.
|
||||
|
||||
`global_ordinals` is the second fastest option, but the fact that it preemptively allocates buckets can be memory-intensive,
|
||||
especially if you have one or more sub aggregations. It is used by default on top-level terms aggregations.
|
||||
|
||||
`global_ordinals_hash` on the contrary to `global_ordinals` and `global_ordinals_low_cardinality` allocates buckets dynamically
|
||||
so memory usage is linear to the number of values of the documents that are part of the aggregation scope. It is used by default
|
||||
in inner aggregations.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
|
@ -419,6 +592,6 @@ perform better than the other one, you have the ability to "hint" it to Elastics
|
|||
}
|
||||
--------------------------------------------------
|
||||
|
||||
<1> the possible values are `map`, `ordinals` and `global_ordinals`
|
||||
<1> the possible values are `map`, `global_ordinals`, `global_ordinals_hash` and `global_ordinals_low_cardinality`
|
||||
|
||||
Please note that Elasticsearch will ignore this execution hint if it is not applicable.
|
||||
Please note that Elasticsearch will ignore this execution hint if it is not applicable and that there is no backward compatibility guarantee on these hints.
|
||||
|
|
|
@ -16,6 +16,8 @@ include::metrics/valuecount-aggregation.asciidoc[]
|
|||
|
||||
include::metrics/percentile-aggregation.asciidoc[]
|
||||
|
||||
include::metrics/percentile-rank-aggregation.asciidoc[]
|
||||
|
||||
include::metrics/cardinality-aggregation.asciidoc[]
|
||||
|
||||
include::metrics/geobounds-aggregation.asciidoc[]
|
||||
|
|