diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000..b5fb80d3 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @hdhalter @kolchfa-aws @Naarcha-AWS @vagimeli @AMoo-Miki @natebower @dlvenable @scrawfor99 diff --git a/.github/ISSUE_TEMPLATE/broken_links.md b/.github/ISSUE_TEMPLATE/broken_links.md new file mode 100644 index 00000000..f9d38758 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/broken_links.md @@ -0,0 +1,7 @@ +--- +title: '[AUTOCUT] Broken links' +labels: 'bug' +--- + +Links checker has failed on push of your commit. +Please examine the workflow log {{ env.WORKFLOW_URL }}. diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/issue_template.md new file mode 100644 index 00000000..0d8b3522 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -0,0 +1,20 @@ +--- +name: 📃 Documentation issue +about: Need docs? Create an issue to request or add new documentation. +title: '[DOC]' +labels: 'untriaged' +assignees: '' +--- + +**What do you want to do?** + +- [ ] Request a change to existing documentation +- [ ] Add new documentation +- [ ] Report a technical problem with the documentation +- [ ] Other + +**Tell us about your request.** Provide a summary of the request and all versions that are affected. + + +**What other resources are available?** Provide links to related issues, POCs, steps for testing, etc. + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..7eccae70 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,10 @@ +### Description +_Describe what this change achieves._ + +### Issues Resolved +_List any issues this PR will resolve, e.g. Closes [...]._ + + +### Checklist +- [ ] By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license and subject to the [Developers Certificate of Origin](https://github.com/opensearch-project/OpenSearch/blob/main/CONTRIBUTING.md#developer-certificate-of-origin). +For more information on following Developer Certificate of Origin and signing off your commits, please check [here](https://github.com/opensearch-project/OpenSearch/blob/main/CONTRIBUTING.md#developer-certificate-of-origin). diff --git a/.github/dco.yml b/.github/dco.yml new file mode 100644 index 00000000..37e411e1 --- /dev/null +++ b/.github/dco.yml @@ -0,0 +1,2 @@ +require: + members: false \ No newline at end of file diff --git a/.github/vale/styles/OpenSearch/AcronymParentheses.yml b/.github/vale/styles/OpenSearch/AcronymParentheses.yml new file mode 100644 index 00000000..cb085104 --- /dev/null +++ b/.github/vale/styles/OpenSearch/AcronymParentheses.yml @@ -0,0 +1,75 @@ +extends: conditional +message: "'%s': Spell out acronyms the first time that you use them on a page and follow them with the acronym in parentheses. Subsequently, use the acronym alone." +link: 'https://github.com/opensearch-project/documentation-website/blob/main/STYLE_GUIDE.md#acronyms' +level: warning +scope: summary +ignorecase: false +# Ensures that the existence of 'first' implies the existence of 'second'. +first: '\b((? 
+ github.event.pull_request.merged + && ( + github.event.action == 'closed' + || ( + github.event.action == 'labeled' + && contains(github.event.label.name, 'backport') + ) + ) + steps: + - name: GitHub App token + id: github_app_token + uses: tibdex/github-app-token@v1.5.0 + with: + app_id: ${{ secrets.APP_ID }} + private_key: ${{ secrets.APP_PRIVATE_KEY }} + # opensearch-trigger-bot installation ID + installation_id: 22958780 + + - name: Backport + uses: VachaShah/backport@v2.1.0 + with: + github_token: ${{ steps.github_app_token.outputs.token }} + head_template: backport/backport-<%= number %>-to-<%= base %> diff --git a/.github/workflows/delete_backport_branch.yml b/.github/workflows/delete_backport_branch.yml new file mode 100644 index 00000000..387a124b --- /dev/null +++ b/.github/workflows/delete_backport_branch.yml @@ -0,0 +1,15 @@ +name: Delete merged branch of the backport PRs +on: + pull_request: + types: + - closed + +jobs: + delete-branch: + runs-on: ubuntu-latest + if: startsWith(github.event.pull_request.head.ref,'backport/') + steps: + - name: Delete merged branch + uses: SvanBoxel/delete-merged-branch@main + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/jekyll-build.yml b/.github/workflows/jekyll-build.yml new file mode 100644 index 00000000..deb574d6 --- /dev/null +++ b/.github/workflows/jekyll-build.yml @@ -0,0 +1,16 @@ +name: Jekyll Build Verification + +on: [pull_request] + +jobs: + check: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.2' + bundler-cache: true + - run: | + JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --future diff --git a/.github/workflows/link-checker.yml b/.github/workflows/link-checker.yml new file mode 100644 index 00000000..2e8c63a8 --- /dev/null +++ b/.github/workflows/link-checker.yml @@ -0,0 +1,26 @@ +name: Check Links +on: + workflow_dispatch: + schedule: + - cron: "30 11 * * 0" +jobs: + check: + if: github.repository == 'opensearch-project/documentation-website' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.0' + bundler-cache: true + - run: | + JEKYLL_FATAL_LINK_CHECKER=all bundle exec jekyll build --future + - name: Create Issue On Build Failure + if: ${{ failure() }} + uses: dblock/create-a-github-issue@v3 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + WORKFLOW_URL: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + with: + update_existing: true + filename: .github/ISSUE_TEMPLATE/broken_links.md diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml new file mode 100644 index 00000000..2eee5d82 --- /dev/null +++ b/.github/workflows/vale.yml @@ -0,0 +1,23 @@ +name: Style check + +on: + pull_request: + workflow_dispatch: + +jobs: + style-job: + runs-on: ubuntu-latest + steps: + - name: Check out + uses: actions/checkout@v3 + + - name: Run Vale + uses: errata-ai/vale-action@reviewdog + env: + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} + with: + fail_on_error: false + reporter: github-pr-check + filter_mode: added + vale_flags: "--no-exit" + version: 2.28.0 \ No newline at end of file diff --git a/.vale.ini b/.vale.ini new file mode 100644 index 00000000..2fb470b9 --- /dev/null +++ b/.vale.ini @@ -0,0 +1,70 @@ +StylesPath = ".github/vale/styles" +Vocab = "OpenSearch" +MinAlertLevel = warning +SkippedScopes = code, style + +[*.md] +BasedOnStyles = Vale, OpenSearch + +BlockIgnores = 
{%-?\s*comment[.|\s|\S]*?endcomment\s*-?%}, \ + {%\s*raw[.|\s|\S]*?endraw\s*%}, \ + {:+\s*[\.\w-\s]*\s*}, \ + {%\s+[^%]*%} + +# ignore variables +TokenIgnores = [a-zA-Z_]+((?:_|\.)[a-zA-Z]+)+ + +# override Vale spelling +Vale.Spelling = NO +Vale.Repetition = NO +Vale.Terms = YES +OpenSearch.AcronymParentheses = YES +OpenSearch.Ampersand = YES +OpenSearch.Cyber = YES +OpenSearch.DashSpacing = YES +OpenSearch.DirectionAboveBelow = YES +OpenSearch.DirectionTopBottom = YES +OpenSearch.Exclamation = YES +OpenSearch.FailoverNoun = YES +OpenSearch.FailoverVerb = YES +OpenSearch.FutureTense = NO +OpenSearch.HeadingAcronyms = YES +OpenSearch.HeadingCapitalization = YES +OpenSearch.HeadingColon = YES +OpenSearch.HeadingPunctuation = YES +OpenSearch.Inclusive = YES +OpenSearch.LatinismsElimination = YES +OpenSearch.LatinismsSubstitution = YES +OpenSearch.LinksDoubleParentheses = YES +OpenSearch.LinksDoubleSlash = YES +OpenSearch.LinksEndSlash = YES +OpenSearch.LinksMidSlash = YES +OpenSearch.LoginNoun = YES +OpenSearch.LoginVerb = YES +OpenSearch.LogoutNoun = YES +OpenSearch.LogoutVerb = YES +OpenSearch.MergeConflicts = YES +OpenSearch.OxfordComma = YES +OpenSearch.PassiveVoice = NO +OpenSearch.Please = YES +OpenSearch.Range = YES +OpenSearch.Repetition = YES +OpenSearch.RolloverNoun = YES +OpenSearch.RolloverVerb = YES +OpenSearch.SetupNoun = YES +OpenSearch.SetupVerb = YES +OpenSearch.SignatureV4 = YES +OpenSearch.Simple = YES +OpenSearch.SpacingPunctuation = YES +OpenSearch.SpacingSlash = YES +OpenSearch.SpacingWords = YES +OpenSearch.Spelling = YES +OpenSearch.StackedHeadings = YES +OpenSearch.SubstitutionsError = YES +OpenSearch.SubstitutionsSuggestion = YES +OpenSearch.TableHeadings = YES +OpenSearch.TimeoutNoun = YES +OpenSearch.TimeoutVerb = YES +OpenSearch.UnitsNames = YES +OpenSearch.UnitsSpacing = YES +OpenSearch.Version = YES \ No newline at end of file diff --git a/404.md b/404.md index f21d3a51..60a1bc88 100644 --- a/404.md +++ b/404.md @@ -1,3 +1,19 @@ --- permalink: /404.html +title: 404 +layout: default +heading_anchors: false +nav_exclude: true --- + +## Oops, this isn't the page you're looking for. + +Maybe our [home page](https://opensearch.org/docs/latest) or one of the commonly visited pages below will help. If you need further support, please use the feedback feature on the right side of the screen to get in touch. + +- [Quickstart]({{site.url}}{{site.baseurl}}/quickstart/) +- [Installing OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/) +- [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/) +- [Query DSL]({{site.url}}{{site.baseurl}}/query-dsl/) +- [API Reference]({{site.url}}{{site.baseurl}}/api-reference/index/) + + diff --git a/API_STYLE_GUIDE.md b/API_STYLE_GUIDE.md new file mode 100644 index 00000000..6dc40df0 --- /dev/null +++ b/API_STYLE_GUIDE.md @@ -0,0 +1,197 @@ +# API Style Guide + +This guide provides the basic structure for creating OpenSearch API documentation. It includes the various elements that we feel are most important to creating complete and useful API documentation, as well as description and examples where appropriate. + +Depending on the intended purpose of the API, *some sections will be required while others may not be applicable*. + +Use the [API_TEMPLATE](templates/API_TEMPLATE.md) to create an API documentation page. 
+ +### A note on terminology ### + +Terminology for API parameters varies in the software industry, where two or even three names may be used to label the same type of parameter. For consistency, we use the following nomenclature for parameters in our API documentation: +* *Path parameter* – "path parameter" and "URL parameter" are sometimes used synonymously. To avoid confusion, we use "path parameter" in this documentation. +* *Query parameter* – This parameter name is often used synonymously with "request parameter." We use "query parameter" to be consistent. + +### General usage for code elements + +When you describe any code element in a sentence, such as an API, a parameter, or a field, you can use the noun name. + *Example usage*: + The time field provides a timestamp for job completion. + +When you provide an exact example with a value, you can use the code element in code font. + *Example usage*: + The response provides a value for `time_field`, such as “timestamp.” + +Provide a REST API call example in `json` format. Optionally, also include the `curl` command if the call can only be executed in a command line. + +## Basic elements for documentation + +The following sections describe the basic API documentation structure. Each section is discussed under its respective heading. Include only those elements appropriate to the API. + +Depending on where the documentation appears within a section or subsection, heading levels may be adjusted to fit with other content. + +1. Name of API (heading level 2) +1. (Optional) Path and HTTP methods (heading level 3) +1. Path parameters (heading level 3) +1. Query parameters (heading level 3) +1. Request fields (heading level 3) +1. Example request (heading level 4) +1. Example response (heading level 4) +1. Response fields (heading level 3) + +## API name + +Provide an API name that describes its function, followed by a description of its top use case and any usage recommendations. + +*Example function*: "Autocomplete queries" + +Use sentence capitalization for the heading (for example, "Create or update mappings"). When you refer to the API operation, you can use lowercase with code font. + +If there is a corresponding OpenSearch Dashboards feature, provide a “See also” link that references it. +*Example*: “To learn more about monitor findings, see [Document findings](https://opensearch.org/docs/latest/monitoring-plugins/alerting/monitors/#document-findings)." + +If applicable, provide any caveats to its usage with a note or tip, as in the following example: + +"If you use the Security plugin, make sure you have the appropriate permissions." +(To set this point in note-style format, follow the text on the next line with {: .note}) + +### Path and HTTP methods + +For relatively complex API calls that include path parameters, it's sometimes a good idea to provide an example so that users can visualize how the request is properly formed. This section is optional and includes examples that illustrate how the endpoint and path parameters fit together in the request. The following is an example of this section for the nodes stats API: + +```json +GET /_nodes/stats +GET /_nodes//stats +GET /_nodes/stats/ +GET /_nodes//stats/ +GET /_nodes/stats// +GET /_nodes//stats// +``` + +### Path parameters + +While the API endpoint states a point of entry to a resource, the path parameter acts on the resource that precedes it. Path parameters come after the resource name in the URL. 
+ +In the following example, the resource is `scroll` and its path parameter is ``: + +```json +GET _search/scroll/ +``` + +Introduce what the path parameters can do at a high level. Provide a table with parameter names and descriptions. Include a table with the following columns: +*Parameter* – Parameter name in plain font. +*Data type* – Data type capitalized (such as Boolean, String, or Integer). +*Description* – Sentence to describe the parameter function, default values or range of values, and any usage examples. + +Parameter | Data type | Description +:--- | :--- | :--- + +### Query parameters + +In terms of placement, query parameters are always appended to the end of the URL and located to the right of the operator "?". Query parameters serve the purpose of modifying information to be retrieved from the resource. + +In the following example, the endpoint is `aliases` and its query parameter is `v` (provides verbose output): + +```json +GET _cat/aliases?v +``` + +Include a paragraph that describes how to use the query parameters with an example in code font. Include the query parameter operator "?" to delineate query parameters from path parameters. + +For GET and DELETE APIs: Introduce what you can do with the optional parameters. Include a table with the same columns as the path parameter table. + +Parameter | Data type | Description +:--- | :--- | :--- + +### Request fields + +For PUT and POST APIs: Introduce what the request fields are allowed to provide in the body of the request. + +Include a table with these columns: +*Field* – Field name in plain font. +*Data type* – Data type capitalized (such as Boolean, String, or Integer). +*Description* – Sentence to describe the field’s function, default values or range of values, and any usage examples. + +Field | Data type | Description +:--- | :--- | :--- + +#### Example request + +Provide a sentence that describes what is shown in the example, followed by a cut-and-paste-ready API request in JSON format. Make sure that you test the request yourself in the Dashboards Dev Tools console to make sure it works. See the following examples. + +The following request gets all the settings in your index: + +```json +GET /sample-index1/_settings +``` + +The following request copies all of your field mappings and settings from a source index to a destination index: + +```json +POST _reindex +{ + "source":{ + "index":"sample-index-1" + }, + "dest":{ + "index":"sample-index-2" + } +} +``` + +#### Example response + +Include a JSON example response to show what the API returns. See the following examples. + +The `GET /sample-index1/_settings` request returns the following response fields: + +```json +{ + "sample-index1": { + "settings": { + "index": { + "creation_date": "1622672553417", + "number_of_shards": "1", + "number_of_replicas": "1", + "uuid": "GMEA0_TkSaamrnJSzNLzwg", + "version": { + "created": "135217827", + "upgraded": "135238227" + }, + "provided_name": "sample-index1" + } + } + } +} +``` + +The `POST _reindex` request returns the following response fields: + +```json +{ + "took" : 4, + "timed_out" : false, + "total" : 0, + "updated" : 0, + "created" : 0, + "deleted" : 0, + "batches" : 0, + "version_conflicts" : 0, + "noops" : 0, + "retries" : { + "bulk" : 0, + "search" : 0 + }, + "throttled_millis" : 0, + "requests_per_second" : -1.0, + "throttled_until_millis" : 0, + "failures" : [ ] +} +``` + +### Response fields + +For PUT and POST APIs: Define all allowable response fields that can be returned in the body of the response. 
+ +Field | Data type | Description +:--- | :--- | :--- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c4b6a1c5..26a966b0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,59 +1,144 @@ -# Contributing Guidelines +- [Creating an issue](#creating-an-issue) +- [Contributing content](#contributing-content) + - [Contribution workflow](#contribution-workflow) + - [Before you start](#before-you-start) + - [Making minor changes](#making-minor-changes) + - [Making major changes](#making-major-changes) + - [Setting up your local copy of the repository](#setting-up-your-local-copy-of-the-repository) + - [Making, viewing, and submitting changes](#making-viewing-and-submitting-changes) +- [Review process](#review-process) + - [Style linting](#style-linting) +- [Getting help](#getting-help) -Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional -documentation, we greatly value feedback and contributions from our community. +# Contributing guidelines -Please read through this document before submitting any issues or pull requests to ensure we have all the necessary -information to effectively respond to your bug report or contribution. +Thank you for your interest in improving the OpenSearch documentation! We value and appreciate all feedback and contributions from our community, including requests for additional documentation, corrections to existing content, and reports of technical issues with the documentation site. +You can [create an issue](#creating-an-issue) asking us to change the documentation or [contribute content](#contributing-content) yourself. -## Reporting Bugs/Feature Requests +NOTE: If you’d like to contribute but don't know where to start, try browsing existing [issues](https://github.com/opensearch-project/documentation-website/issues). Our projects use custom GitHub issue labels for status, version, type of request, and so on. We recommend starting with any issue labeled "good first issue" if you're a beginner or "help wanted" if you're a more experienced user. -We welcome you to use the GitHub issue tracker to report bugs or suggest features. +## Creating an issue -When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already -reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: +Use the documentation issue template to describe the change you'd like to make: -* A reproducible test case or series of steps -* The version of our code being used -* Any modifications you've made relevant to the bug -* Anything unusual about your environment or deployment +1. Go to https://github.com/opensearch-project/documentation-website/issues and select **New issue**. +1. Enter the requested information, including as much detail as possible, especially which version or versions the request affects. +1. Select **Submit new issue**. +The `untriaged` label is assigned automatically. During the triage process, the Documentation team will add the appropriate labels, assign the issue to a technical writer, and prioritize the request. We may follow up with you for additional information. -## Contributing via Pull Requests -Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: +## Contributing content -1. You are working against the latest source on the *main* branch. -2. 
You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. -3. You open an issue to discuss any significant work - we would hate for your time to be wasted. +There are two ways to contribute content, depending on the magnitude of the change: -To send us a pull request, please: +- [Minor changes](#making-minor-changes): For small changes to existing files, like fixing typos or adding parameters, you can edit files in GitHub directly. This approach does not require cloning the repository and does not allow you to test the documentation. +- [Major changes](#making-major-changes): For changes you want to test first, like adding new or reorganizing pages or adding a table or section, you can edit files locally and push the changes to GitHub. This approach requires setting up a local version of the repository and allows you to test the documentation. -1. Fork the repository. -2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. -3. Ensure local tests pass. -4. Commit to your fork using clear commit messages. -5. Send us a pull request, answering any default questions in the pull request interface. -6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. +### Contribution workflow -GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and -[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). +The workflow for contributing documentation is the same as the one for contributing code: +- Make your changes. +- Build the documentation website to check your work (only possible if you are making changes locally). +- Submit a [pull request](https://github.com/opensearch-project/documentation-website/pulls) (PR). +- A maintainer reviews and merges your PR. -## Finding contributions to work on -Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. +### Before you start +Before contributing content, make sure to read the following resources: +- [README](README.md) +- [OpenSearch Project Style Guidelines](STYLE_GUIDE.md) +- [API Style Guide](API_STYLE_GUIDE.md) +- [Formatting Guide](FORMATTING_GUIDE.md) -## Code of Conduct -This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). -For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact -opensource-codeofconduct@amazon.com with any additional questions or comments. +NOTE: Make sure that any documentation you submit is your own work or work that you have the right to submit. We respect the intellectual property rights of others, and as part of contributing, we'll ask you to sign your contribution with a [Developer Certificate of Origin (DCO)](https://github.com/opensearch-project/.github/blob/main/CONTRIBUTING.md#developer-certificate-of-origin) stating that you have the right to submit your contribution and that you understand that we will use your contribution. 
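+ +For reference, the DCO sign-off is a single trailer line at the end of your commit message. A minimal sketch follows; the name and email address are placeholders that Git fills in from your `user.name` and `user.email` configuration when you commit with the `-s` option: + +``` +Signed-off-by: Jane Doe <jane.doe@example.com> +``` +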
+### Making minor changes -## Security issue notifications -If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. +If you want to make minor changes to an existing file, you can use this approach: +1. [Fork this repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo). -## Licensing +1. In your fork on GitHub, navigate to the file that you want to change. -See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. +1. In the upper-right corner, select the pencil icon and edit the file. + +1. In the upper-right corner, select **Commit changes...***. Enter the commit message and optional description and select **Create a new branch for this commit and start a pull request**. + +### Making major changes + +If you're adding a new page or making major changes to the documentation, such as adding new images, sections, or styling, we recommend that you work in a local copy of the repository and test the rendered HTML before submitting a PR. + +#### Setting up your local copy of the repository + +Follow these steps to set up your local copy of the repository: + +1. [Fork this repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo) and clone your fork. + +1. Navigate to your cloned repository. + +1. Install [Ruby](https://www.ruby-lang.org/en/) if you don't already have it. We recommend [RVM](https://rvm.io/), but you can use any method you prefer: + + ``` + curl -sSL https://get.rvm.io | bash -s stable + rvm install 3.2 + ruby -v + ``` + +1. Install [Bundler](https://bundler.io/) if you don't already have it: + + ``` + gem install bundler + ``` + +1. Install Jekyll and all the dependencies: + + ``` + bundle install + ``` + +#### Making, viewing, and submitting changes + +Here's how to build the website, make changes, and view them locally: + +1. Build the website: + + ``` + sh build.sh + ``` + + The build script should automatically open your web browser, but if it doesn't, open [http://localhost:4000/docs/](http://localhost:4000/docs/). + +1. Create a new branch against the latest source on the main branch. + +1. Edit the Markdown files that you want to change. + +1. When you save a file, Jekyll automatically rebuilds the site and refreshes your web browser. This process can take 60--90 seconds. + +1. When you're happy with how everything looks, commit, [sign off](https://github.com/src-d/guide/blob/9171d013c648236c39faabcad8598be3c0cf8f56/developer-community/fix-DCO.md#how-to-prevent-missing-sign-offs-in-the-future), push your changes to your fork, and submit a PR. + + Note that a PR requires DCO sign-off before we can merge it. You can use the -s command line option to append this automatically to your commit message, for example, `git commit -s -m 'This is my commit message'`. For more information, see https://github.com/apps/dco. + +## Review process + +We greatly appreciate all contributions to the documentation and will review them as quickly as possible. + +During the PR process, expect that there will be some back-and-forth. If you want your contribution to be merged quickly, try to respond to comments in a timely fashion, and let us know if you don't want to continue with the PR. 
+ +We use the [Vale](https://github.com/errata-ai/vale) linter to ensure that our documentation adheres to the [OpenSearch Project Style Guidelines](STYLE_GUIDE.md). Addressing Vale comments on the PR expedites the review process. You can also install Vale locally so you can address the comments before creating a PR. For more information, see [Style linting](#style-linting). + +If we accept the PR, we will merge it and will backport it to the appropriate branches. + +### Style linting + +To ensure that our documentation adheres to the [OpenSearch Project Style Guidelines](STYLE_GUIDE.md), we use the [Vale](https://github.com/errata-ai/vale) linter. Addressing Vale comments on the PR expedites the review process. You can also install Vale locally as follows so you can address the comments before creating a PR: + +1. Run `brew install vale`. +2. Run `vale *` from the documentation site root directory to lint all Markdown files. To lint a specific file, run `vale /path/to/file`. + +Optionally, you can install the [Vale VSCode](https://github.com/chrischinchilla/vale-vscode) extension, which integrates Vale with Visual Studio Code. By default, only _errors_ and _warnings_ are underlined. To change the minimum alert level to include _suggestions_, go to **Vale VSCode** > **Extension Settings** and select **suggestion** in the **Vale > Vale CLI: Min Alert Level** dropdown list. + +## Getting help + +For help with the contribution process, reach out to one of the [points of contact](README.md#points-of-contact). diff --git a/FORMATTING_GUIDE.md b/FORMATTING_GUIDE.md new file mode 100644 index 00000000..ea5f7117 --- /dev/null +++ b/FORMATTING_GUIDE.md @@ -0,0 +1,470 @@ +# Formatting Guide + +This guide provides an overview of the formatted elements commonly used in the OpenSearch documentation. + +* * * + +### Table of contents + +* [Adding pages or sections](#adding-pages-or-sections) +* [Buttons](#buttons) +* [Callouts](#callouts) +* [Collapsible blocks](#collapsible-blocks) +* [Dashes](#dashes) +* [Horizontal rule](#horizontal-rule) +* [Images](#images) + * [Images in line with text](#images-in-line-with-text) +* [Labels](#labels) +* [Links](#links) +* [Lists](#lists) + * [Unordered lists](#unordered-lists) + * [Ordered lists](#ordered-lists) + * [Nested lists](#nested-lists) + * [Lists with code snippets or images](#lists-with-code-snippets-or-images) +* [Math](#math) +* [Tables](#tables) +* [Text style](#text-style) +* [Variables in curly braces](#variables-in-curly-braces) +* [Videos](#videos) + +* * * + +## Adding pages or sections + +This repository contains [Markdown](https://guides.github.com/features/mastering-markdown/) files organized into Jekyll _collections_ (for example, `_api-reference` or `_dashboards`). Each Markdown file corresponds to one page on the website. + +In addition to the content for a given page, each Markdown file contains some Jekyll [front matter](https://jekyllrb.com/docs/front-matter/) similar to the following: + +``` +--- +layout: default +title: Date +nav_order: 25 +has_children: false +parent: Date field types +grand_parent: Supported field types +--- +``` + +If you want to reorganize content or add a new page, make sure to set the appropriate `has_children`, `parent`, `grand_parent`, and `nav_order` variables, which define the hierarchy of pages in the left navigation. + +When adding a page or a section, make the `nav_order` of the child pages multiples of 10. 
For example, if you have a parent page `Clients`, make child pages `Java`, `Python`, and `JavaScript` have a `nav_order` of 10, 20, and 30, respectively. Doing so makes inserting additional child pages easier because it does not require you to renumber existing pages. + +Each collection must have an `index.md` file that corresponds to the collection's index page. In the `index.md` file's front matter, specify `nav_excluded: true` so that the page does not appear separately under the collection. + +## Buttons + +You can use either `copy` or `copy-curl` includes for code snippets. The `copy` include places a **Copy** button on the code snippet, while the `copy-curl` include places both **Copy** and **Copy as cURL** buttons. Use the `copy-curl` include for API requests. If an API request is already in the cURL format, use the `copy` include. + +**Example of a `copy` include** + +```` +```bash +curl -XGET "localhost:9200/_tasks?actions=*search&detailed +``` +{% include copy.html %} +```` + +**Example of a `copy-curl` include** + +```` +```json +PUT /sample-index1/_clone/cloned-index1 +{ + "aliases": { + "sample-alias1": {} + } +} +``` +{% include copy-curl.html %} +```` + +## Callouts + +You can use four levels of callouts: + +* `{: .note}` blue +* `{: .tip }` green +* `{: .important}` yellow +* `{: .warning}` red + +Place a callout directly under the paragraph to which you want to apply the callout style. + +**Example** + +``` +In case of a cluster or node failure, all PIT data is lost. +{: .note} +``` + +For a callout with multiple paragraphs or lists, use `>`: + +``` +> ****PREREQUISITE**** +> +> To use a custom vector map with GeoJSON, install these two required plugins: +> * OpenSearch Dashboards Maps [`dashboards-maps`](https://github.com/opensearch-project/dashboards-maps_) front-end plugin +> * OpenSearch [`geospatial`](https://github.com/opensearch-project/geospatial_) backend plugin +{: .note} + +``` + +## Collapsible blocks + +To insert an open collapsible block, use the `
<details>` element as follows: + +````html +<details open markdown="block">
+  <summary> +    Response +  </summary> +  {: .text-delta} + +```json +{ + "_nodes" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + } +} +``` +</details>
+```` + +To insert a closed collapsible block, omit the `open` state: + +````html +<details markdown="block">
+  <summary> +    Response +  </summary> +  {: .text-delta} + +```json +{ + "_nodes" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + } +} +``` +</details>
+```` + +Collapsible blocks are useful for long responses and for the Table of Contents at the beginning of a page. + +## Dashes + +Use one dash for hyphens, two for en dashes, and three for em dashes: + +``` +upper-right +10--12 nodes per cluster +There is one candidate generator available---`direct_generator`. +``` + +## Horizontal rule + +A horizontal rule is used to separate text sections. Use three asterisks separated by spaces for a horizontal rule: + +``` +## Why use OpenSearch? + +* * * +``` + +## Images + +Place images in the `images` directory of the documentation website. To refer to images, use relative links (see [Internal links](#internal-links) for more information). + +Markdown images are responsive by default. To insert a Markdown image, use the `![](link)` syntax: + +``` +![OS branding]({{site.url}}{{site.baseurl}}/images/brand.png) +``` + +Markdown uses the image’s actual width to render it. It sets the maximum image width to the width of the main body panel. + +If you want to specify the image width or another style, use HTML syntax: + +``` +OS branding +``` + +You can specify width as a hard-coded number of pixels, as in the preceding example, or as a percentage of the parent width: + +``` +OS branding +``` + +To stretch the image to fit the width of the main body panel, use width=“100%”. + +To take high-resolution screenshots, in Firefox, right-click on the page and choose “Take Screenshot”. + +Image borders are automatic; do not manually add a border to an image. + +Always **separate an image from the text with a blank line**: + +``` +To send a query to OpenSearch, select the query by placing the cursor anywhere in the query text. Then choose the triangle on the top right of the request or press `Ctrl/Cmd+Enter`: + +Send request +``` + +Do not place an image next to text or insert artificial line breaks using `
`. Otherwise, the text might render as aligned to the bottom of the image, with the image on the right. + +If the image is under a list item, place it on a new line with a tab. For more examples, see [Lists with code snippets or images](#lists-with-code-snippets-or-images). + +### Images in line with text + +When describing an icon, use the icon's name followed by an inline image in parentheses. Insert the image in line with text using the `nomarkdown` extension and an HTML image: + +``` +Choose the play icon ({::nomarkdown}play icon{:/}) on the upper right of the request. +``` + +## Labels + +You can use the following labels: + +* label-blue +* label-green +* label-purple +* label-red +* label-yellow + +Use a purple label to specify the version in which an API was introduced: + +``` +# Alias +Introduced 1.0 +{: .label .label-purple } +``` + +If we introduce a breaking change to an operation, add an additional label with a link to the release note for that breaking change: + +``` +## Get roles +Introduced 1.0 +{: .label .label-purple } +[Last breaking change 2.0](https://example.com) +{: .label .label-red } +``` + +## Links + +To add a link to a document, section, or image, use the `[name](link)` syntax, for example: + +``` +## Looking for the Javadoc? + +See [opensearch.org/javadocs/](https://opensearch.org/javadocs/). +``` + +### Section links + +**Section links** are links to headings in your document. Markdown lowercases the headings for links, drops back ticks, and replaces spaces with hyphens: + +``` +## The `minimum_should_match` parameter + +For more information, see [the `minimum_should_match` parameter](#the-minimum_should_match-parameter). +``` + +### Internal links + +**Internal links** are links to another document or image within the documentation website. Because the documentation website is versioned, do not hard code the version number in the link. Use the relative path, where `{{site.url}}{{site.baseurl}}` refers to the main directory, instead: + +``` +If you need to use a field for exact-value search, map it as a [`keyword`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/keyword/). +``` + +### GitHub links + +When linking to a GitHub issue or PR, refer to the issue or PR number in the following format: + +``` +For more details, see issue [#1940](https://github.com/opensearch-project/opensearch/issues/1940). +``` + +## Lists + +Markdown supports unordered and ordered lists, nested lists, and lists with code snippets or images. + +### Unordered lists + +Use asterisks or dashes for unordered lists: + +``` +* One +* Two +``` + +or + +``` +- One +- Two +``` + +Lists with dashes render the list items closer to each other vertically, while lists with asterisks have more space between the lines. + +Don’t mix and match asterisks and dashes. + +### Ordered lists + +Use all 1s for ordered lists: + +``` +1. One +1. Two +``` + +Jekyll will automatically correctly number the items, and it will be much easier for you to insert and delete items without renumbering. + +If there is a paragraph in the middle of a list, the list will restart with 1 after the paragraph. If you want to continue the list after the paragraph, use `counter-reset: none`: + +``` +1. One + +Paragraph that breaks the numbering + +{:style="counter-reset: none"} +1. Two +``` + +### Nested lists + +Use tabs to nest lists: + +``` +1. 
Parent 1 + - Child 1 + - Child 2 + - Grandchild 1 +``` + +Markdown automatically adjusts numbered lists so that they use numbers and letters, so always use 1s for nested numbered lists. + +### Lists with code snippets or images + +If you need to position an image or a code snippet within a list, use tabs to signal to Markdown that the image or code snippet is part of the list item. + +**Example with code snippets** + +``` +1. Run the demo batch script. + There are two ways of running the batch script: + 1. Run the batch script using the Windows UI: + 1. Navigate to the top directory of your OpenSearch installation and open the `opensearch-{{site.opensearch_version}}` folder. + 1. Run the batch script by double-clicking the `opensearch-windows-install.bat` file. This opens a command prompt with an OpenSearch instance running. + 1. Run the batch script from Command prompt or Powershell: + 1. Open Command Prompt by entering `cmd`, or Powershell by entering `powershell`, in the search box next to ****Start**** on the taskbar. + 1. Change to the top directory of your OpenSearch installation. + ```bat + cd \path\to\opensearch-{{site.opensearch_version}} + ``` + 1. Run the batch script. + ```bat + .\opensearch-windows-install.bat + ``` +``` + +**Example with images** + +``` +1. To begin, select the rule in the **Rule name** column. The rule details pane opens, as shown in the following image. + Opening the rule details pane + +1. Select the **Duplicate** button in the upper-right corner of the pane. The **Duplicate rule** window opens in Visual Editor view, and all of the fields are automatically populated with the rule's details. Details are also populated in YAML Editor view, as shown in the following image. + Selecting the duplicate button opens the Duplicate rule window +``` + +## Math + +To add mathematical expressions to a page, add `has_math: true` to the page’s front matter. Then insert LaTeX math into HTML tags with the rest of your Markdown content, as shown in the following example: + +``` +## Math + +Some Markdown paragraph. Here's a formula: + +

+<p> +  When \(a \ne 0\), there are two solutions to \(ax^2 + bx + c = 0\) and they are +  \[x = {-b \pm \sqrt{b^2-4ac} \over 2a}.\] +</p>

+ +And back to Markdown. +``` + +## Tables + +Markdown table columns are automatically sized, and there is no need to specify a different number of dashes in the formatting. + +**Example** + +``` +Header 1 | Header 2 +:--- | :--- +Body 1 | Body 2, which is extremely lengthy, but there is no need to specify its width. +``` + +To insert line breaks within tables, use `
<br>`: + +``` +Header 1 | Header 2 +:--- | :--- +Body 1 | Body paragraph 1<br>
Body paragraph 2 +``` + +To use lists within a table, use `
<br>` and `-` : + +``` +Header 1 | Header 2 +:--- | :--- +Body 1 | List:<br>
 - One<br>
 - Two +``` + + +You can also use `&nbsp;` to insert one space, `&ensp;` to insert two spaces, and `&emsp;` to insert four spaces in table cells. + +If you need a list with real bullet points, use the bullet point HTML code: + +``` +Header 1 | Header 2 +:--- | :--- +Body 1 | List:<br>
 &#8226; One<br>
 • Two +``` + +## Text style + +You can style text in the following ways: + +* ```**bold**``` +* ```_italic_``` or ```*italic*``` + +For guidance on using code examples and when to use code font, see [Code examples](https://github.com/opensearch-project/documentation-website/blob/main/STYLE_GUIDE.md#code-examples). + +## Variables in curly braces + +To correctly display variables that are in curly braces, escape the curly braces with the `{% raw %}{% endraw %}` tags: + +```` +"message_template": { + "source": "the index is {% raw %}{{ctx.index}}{% endraw %}" +} +```` + +The variable `ctx.index` is rendered in double curly braces. + +## Videos + +To insert a video, add a YouTube player include similar to the following: + +``` +{% include youtube-player.html id='_g46WiGPhFs' %} +``` + +Note that the `id` variable refers to the YouTube video ID at the end of the URL. For example, the YouTube video at the URL `https://youtu.be/_g46WiGPhFs` has the ID `_g46WiGPhFs`. The ID must be surrounded with single quotation marks. diff --git a/Gemfile b/Gemfile index d11a3ddc..830dc19d 100644 --- a/Gemfile +++ b/Gemfile @@ -8,7 +8,7 @@ source "https://rubygems.org" # # This will help ensure the proper Jekyll version is running. # Happy Jekylling! -gem "jekyll", "~> 4.2.0" +gem "jekyll", "~> 4.3.2" # This is the default theme for new Jekyll sites. You may change this to anything you like. gem "just-the-docs", "~> 0.3.3" @@ -22,6 +22,7 @@ gem "jekyll-redirect-from", "~> 0.16" # If you have any plugins, put them here! group :jekyll_plugins do + gem "jekyll-last-modified-at" gem "jekyll-sitemap" end @@ -31,4 +32,11 @@ gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby] # Performance-booster for watching directories on Windows gem "wdm", "~> 0.1.0" if Gem.win_platform? +# Installs webrick dependency for building locally gem "webrick", "~> 1.7" + + +# Link checker +gem "typhoeus" +gem "ruby-link-checker" +gem "ruby-enum" diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 00000000..921e46ab --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,16 @@ +## Overview + +This document contains a list of maintainers in this repo. See [opensearch-project/.github/RESPONSIBILITIES.md](https://github.com/opensearch-project/.github/blob/main/RESPONSIBILITIES.md#maintainer-responsibilities) that explains what the role of maintainer means, what maintainers do in this and other repos, and how they should be doing it. If you're interested in contributing, and becoming a maintainer, see [CONTRIBUTING](CONTRIBUTING.md). + +## Current Maintainers + +| Maintainer | GitHub ID | Affiliation | +| ---------------- | ----------------------------------------------- | ----------- | +| Heather Halter | [hdhalter](https://github.com/hdhalter) | Amazon | +| Fanit Kolchina | [kolchfa-aws](https://github.com/kolchfa-aws) | Amazon | +| Nate Archer | [Naarcha-AWS](https://github.com/Naarcha-AWS) | Amazon | +| Nate Bower | [natebower](https://github.com/natebower) | Amazon | +| Melissa Vagi | [vagimeli](https://github.com/vagimeli) | Amazon | +| Miki Barahmand | [AMoo-Miki](https://github.com/AMoo-Miki) | Amazon | +| David Venable | [dlvenable](https://github.com/dlvenable) | Amazon | +| Stephen Crawford | [scraw99](https://github.com/scrawfor99) | Amazon | diff --git a/README.md b/README.md index 1133c125..1faa53a5 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ 社区成员的贡献对保持本文档的完整性,有效性,完整组织和保持最新起到了非常重要的作用。 +- Do you work on one of the various OpenSearch plugins? Take a look at the documentation for the plugin. 
Is everything accurate? Will anything change in the near future? ## 你可以做什么样的帮助 @@ -297,12 +298,12 @@ And back to Markdown. ## 安全特性(Security) -请参考 [参与项目(CONTRIBUTING)](CONTRIBUTING.md#security-issue-notifications) 页面中的内容来获得更多信息。 +If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security using our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Do **not** create a public GitHub issue. ## 许可证(License) -This project is licensed under the Apache-2.0 License。 +This project is licensed under the Apache-2.0 License. ## 版权(Copyright) diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md new file mode 100644 index 00000000..d0602c92 --- /dev/null +++ b/STYLE_GUIDE.md @@ -0,0 +1,510 @@ +# OpenSearch Project Style Guidelines + +Welcome to the content style guide for the OpenSearch Project. This guide covers the style standards to be observed when creating OpenSearch content and will evolve as we implement best practices and lessons learned in order to best serve the community. + +In addition to this guide and [TERMS.md](https://github.com/opensearch-project/documentation-website/blob/main/TERMS.md), our content is generally edited in accordance with the [Microsoft Writing Style Guide](https://docs.microsoft.com/en-us/style-guide/welcome/), [The Chicago Manual of Style](https://www.chicagomanualofstyle.org/home.html), and [Merriam-Webster](https://www.merriam-webster.com/) (listed in order of precedence); however, we may deviate from these style guides in order to maintain consistency and accommodate the unique needs of the community. This is by no means an exhaustive list of style standards, and we value transparency, so we welcome contributions to our style standards and guidelines. If you have a question regarding our standards or adherence/non-adherence to the style guides or would like to make a contribution, please tag @natebower on GitHub. + +## Naming conventions, voice, tone, and brand personality traits + +The following sections provide guidance on OpenSearch Project naming conventions, voice, tone, and brand personality traits. + +### Naming conventions + +The following naming conventions should be observed in OpenSearch Project content: + +* Capitalize both words when referring to the *OpenSearch Project*. +* *OpenSearch* is the name for the distributed search and analytics engine used by Amazon OpenSearch Service. +* Amazon OpenSearch Service is a managed service that makes it easy to deploy, operate, and scale OpenSearch. Use the full name *Amazon OpenSearch Service* on first appearance. The abbreviated service name, *OpenSearch Service*, can be used for subsequent appearances. +* Amazon OpenSearch Serverless is an on-demand serverless configuration for Amazon OpenSearch Service. Use the full name *Amazon OpenSearch Serverless* on first appearance. The abbreviated service name, *OpenSearch Serverless*, can be used for subsequent appearances. +* OpenSearch Dashboards is the UI for OpenSearch. On first appearance, use the full name *OpenSearch Dashboards*. *Dashboards* can be used for subsequent appearances. +* *Security Analytics* is a security information and event management (SIEM) solution for OpenSearch. Capitalize both words when referring to the name of the solution. +* Observability is collection of plugins and applications that let you visualize data-driven events by using Piped Processing Language (PPL). Capitalize *Observability* when referring to the name of the solution. 
+* Refer to OpenSearch Project customers as *users*, and refer to the larger group of users as *the community*. Do not refer to the OpenSearch Project or to the AWS personnel working on the project as a *team*, as this implies differentiation within the community. + +#### Product names + +Capitalize product names. The OpenSearch Project has three products: OpenSearch, OpenSearch Dashboards, and Data Prepper. For example: + +* “To install *OpenSearch*, download the Docker image.” +* “To access *OpenSearch Dashboards*, open your browser and navigate to http://localhost:5601/app/home.” +* “*Data Prepper* contains the following components:” + +Capitalize the names of clients and tools. For example: + +* “The OpenSearch *Python* client provides a more natural syntax for interacting with your cluster.” +* “The *Go* client retries requests for a maximum of three times by default.” +* “The *OpenSearch Kubernetes Operator* is an open-source Kubernetes operator that helps automate the deployment and provisioning of OpenSearch and OpenSearch Dashboards in a containerized environment.” +* “You can send events to *Logstash* from many different sources.” + +#### Features + +Features are the individual building blocks of user experiences, reflect the functionality of a product, and are shared across different experiences. For example, the SQL/PPL, reporting, notifications, alerting, and anomaly detection used for observability are the same SQL/PPL, reporting, notifications, alerting, and anomaly detection used for general analytics, security analytics, and search analytics. Components of the user experience such as navigation, credentials management, theming, etc. are also considered to be features. + +Use lowercase when referring to features, unless you are referring to a formally named feature that is specific to OpenSearch. For example: + +* “The Notifications plugin provides a central location for all of your *notifications* from OpenSearch plugins.” +* “*Remote-backed storage* is an experimental feature. Therefore, we do not recommend the use of *remote-backed storage* in a production environment.” +* “You can take and restore *snapshots* using the snapshot API.” +* “You can use the *VisBuilder* visualization type in OpenSearch Dashboards to create data visualizations by using a drag-and-drop gesture” (You can refer to VisBuilder alone or qualify the term with “visualization type”). + +#### Plugin names + +A plugin is a feature or distinct component that extends the functionality of OpenSearch. For now, capitalize plugin names, but use *plugin* sparingly. The concept of plugins will become obsolete once we re-architect the product. For example: + +* “Interaction with the *ML Commons* plugin occurs through either the REST API or [ad](https://opensearch.org/docs/latest/search-plugins/sql/ppl/functions#ad) and [kmeans](https://opensearch.org/docs/latest/search-plugins/sql/ppl/functions#kmeans) Piped Processing Language (PPL) commands.” +* “Use the *Neural Search* plugin to integrate ML language models into your search workloads.” + +### Voice and tone + +Voice is the point of view or style of a writer. Voice can refer to active or passive but may also refer to verb tense (past, present, future, and so on). Tone is the emotional undercurrent (such as calm or angry) of the voice. We strive to speak to the community with a consistent voice and tone, as if a single writer writes all content. Writing with a common voice also helps to establish the OpenSearch Project identity and brand. 
+ +#### Voice + +The voice of the OpenSearch Project is people oriented and focused on empowering the user directly. We use language that emphasizes what the user can do with OpenSearch rather than what tasks OpenSearch can perform. + +Whenever possible, use the active voice instead of the passive voice. The passive form is typically wordier and can often cause writers to obscure the details of the action. For example, change the agentless passive _it is recommended_ to the more direct _we recommend_. + +Refer to the reader as _you_ (second person), and refer to the OpenSearch Project as _we_ (first person). If there are multiple authors for a blog post, you can use _we_ to refer to the authors as individuals. Do not refer to the OpenSearch Project or to the AWS personnel working on the project as a *team*, as this implies differentiation within the community. + +In most cases, try to describe the actions that the user takes rather than contextualizing from the feature perspective. For example, use phrases such as “With this feature, you can...” or “Use this feature to...” instead of saying a feature *allows*, *enables*, or *lets* the user do something. + +For procedures or instructions, ensure that action is taken by the user (“Then you can stop the container...”) rather than the writer (“We also have to stop the container...”). Reserve the first-person plural for speaking as the OpenSearch Project, with recommendations, warnings, or explanations. + +In general, use the present tense. Use the future tense only when an event happens later than, not immediately after, the action under discussion. + +#### Tone + +The tone of the OpenSearch Project is conversational, welcoming, engaging, and open. The overall tone is knowledgeable but humble, informal but authoritative, informative but not dry, and friendly without being overly familiar. + +We talk to readers in their own words, never assuming that they understand how OpenSearch works. We use precise technical terms where appropriate, but we avoid technical jargon and insider lingo. We speak to readers in simple, plain, everyday language. + +Avoid excessive words, such as please. Be courteous but not wordy. Extra detail can often be moved elsewhere. Use humor with caution because it is subjective, can be easily misunderstood, and can potentially alienate your audience. + +### Brand personality traits + +| Personality trait | Description | Guidance | +| :--------- | :------- | :------ | +| **Clear and precise** | The OpenSearch Project understands that our community works, develops, and builds in roles and organizations that require precise thinking and thorough documentation. We strive to use precise language—to clearly say what we mean without leaving ideas open to interpretation, to support our assertions with facts and figures, and to provide credible and current (third-party) references where called for.

We communicate in plain, direct language that is easily understood. Complex concepts are introduced in a concise, unambiguous way. High-level content is supported by links to more in-depth or technical content that users can engage with at their convenience. | - Write with clarity and choose words carefully. Think about the audience and how they might interpret your assertions.
- Be specific. Avoid estimates or general claims when exact data can be provided.
- Support claims with data. If something is “faster” or “more accurate,” say how much.
- When citing third-party references, include direct links. | +| **Transparent and open** | As an open-source project, we exchange information with the community in an accessible and transparent manner. We publish our product plans in the open on GitHub, share relevant and timely information related to the project through our forum and/or our blog, and engage in open dialogues related to product and feature development in the public sphere. Anyone can view our roadmap, raise a question or an issue, or participate in our community meetings. | - Tell a complete story. If you’re walking the reader through a solution or sharing news, don’t skip important information.
- Be forthcoming. Communicate time-sensitive news and information in a thorough and timely manner.
- If there’s something the reader needs to know, say it up front. Don’t “bury the lede.” | +| **Collaborative and supportive** | We’re part of a community that is here to help. We aim to be resourceful on behalf of the community and encourage others to do the same. To facilitate an open exchange of ideas, we provide forums through which the community can ask and answer one another’s questions. | - Use conversational language that welcomes and engages the audience. Have a dialogue.
- Invite discussion and feedback. We have several mechanisms for open discussion, including requests for comment (RFCs), a [community forum](https://forum.opensearch.org/), and [community meetings](https://www.meetup.com/OpenSearch/). +| **Trustworthy and personable** | We stay grounded in the facts and the data. We do not overstate what our products are capable of. We demonstrate our knowledge in a humble but authoritative way and reliably deliver what we promise. We provide mechanisms and support that allow the audience to explore our products for themselves, demonstrating that our actions consistently match our words.

We speak to the community in a friendly, welcoming, judgment-free way so that our audience perceives us as being approachable. Our content is people oriented and focused on empowering the user directly. | - Claims and assertions should be grounded in facts and data and supported accordingly.
- Do not exaggerate or overstate. Let the facts and results speak for themselves.
- Encourage the audience to explore our products for themselves. Offer guidance to help them do so.
- Write directly and conversationally. Have a dialogue with your audience. Imagine writing as if you’re speaking directly to the person for whom you’re creating content.
- Write from the community, for the community. Anyone creating or consuming content about OpenSearch is a member of the same group, with shared interest in learning about and building better search and analytics solutions. | +| **Inclusive and accessible** | As an open-source project, the OpenSearch Project is for everyone, and we are inclusive. We value the diversity of backgrounds and perspectives in the OpenSearch community and welcome feedback from any contributor, regardless of their experience level.

We design and create content so that people with disabilities can perceive, navigate, and interact with it. This ensures that our documentation is available and useful for everyone and helps improve the general usability of content.

We understand that our community is international, and our writing takes that into account. We use plain language that avoids idioms and metaphors that might not be clear to the broader community. | - Use inclusive language to connect with the diverse and global OpenSearch Project audience.&#13;
- Be careful with word choices.&#13;
- Avoid [sensitive terms](https://github.com/opensearch-project/documentation-website/blob/main/STYLE_GUIDE.md#sensitive-terms).
- Don't use [offensive terms](https://github.com/opensearch-project/documentation-website/blob/main/STYLE_GUIDE.md#offensive-terms).
- Don't use ableist or sexist language or language that perpetuates racist structures or stereotypes.
- Links: Use link text that adequately describes the target page. For example, use the title of the target page instead of “here” or “this link.” In most cases, a formal cross-reference (the title of the page you’re linking to) is the preferred style because it provides context and helps readers understand where they’re going when they choose the link.
- Images:
  - Add introductory text that provides sufficient context for each image.
  - Add ALT text that describes the image for screen readers.
- Procedures: Not everyone uses a mouse, so use device-independent verbs; for example, use “choose” instead of “click.”
- Location: When you’re describing the location of something else in your content, such as an image or another section, use words such as “preceding,” “previous,” or “following” instead of “above” and “below.” + +## Style guidelines + +The following guidelines should be observed in OpenSearch Project content. + +### Acronyms + +Spell out acronyms the first time that you use them on a page and follow them with the acronym in parentheses. Use the format `spelled-out term (acronym)`. On subsequent use, use the acronym alone. + +Do not capitalize the spelled-out form of an acronym unless the spelled-out form is a proper noun or the community generally capitalizes it. In all cases, our usage should reflect the community’s usage. + +In general, spell out acronyms once on a page. However, you can spell them out more often for clarity. + +Make an acronym plural by adding an *s* to the end of it. Do not add an apostrophe. + +How an acronym is pronounced determines whether you use the article *an* or *a* before it. If it's pronounced with an initial vowel sound, use *an*. Otherwise, use *a*. + +If the first use of an acronym is in a heading, retain the acronym in the heading, and then write out the term in the following body text, followed by the acronym in parentheses. Don't spell out the term in the heading with the acronym included in parentheses. If the first use of the service name is in a title or heading, use the short form of the name in the heading, and then use the long form followed by the short form in parentheses in the following body text. + +In general, spell out abbreviations that end with *-bit* or *-byte*. Use abbreviations only with numbers in specific measurements. Always include a space between the number and unit. Abbreviations that are well known and don't need to be spelled out are *KB*, *MB*, *GB*, and *TB*. + +Some acronyms are better known than their spelled-out counterparts or might be used almost exclusively. These include industry-standard protocols, markdown and programming languages, and common file formats. You don't need to spell out these acronyms. + +The following table lists acronyms that you don't need to spell out. + +| Acronym | Spelled-out term | +| :--------- | :------- | +| 3D | three-dimensional | +| AI | artificial intelligence | +| API | application programming interface | +| ASCII | American Standard Code for Information Interchange | +| BASIC | Beginner's All-Purpose Symbolic Instruction Code | +| BM25 | Best Match 25 | +| CLI | command-line interface | +| CPU | central processing unit | +| CRUD | create, read, update, and delete | +| CSV | comma-separated values | +| DNS | Domain Name System | +| DOS | disk operating system | +| FAQ | frequently asked questions | +| FTP | File Transfer Protocol | +| GIF | Graphics Interchange Format | +| HTML | hypertext markup language | +| HTTP | hypertext transfer protocol | +| HTTPS | hypertext transfer protocol secure | +| HTTP(s) | Use to refer to both protocols, HTTP and HTTPS. 
| +| I/O | input/output | +| ID | identifier | +| IP | Internet protocol | +| JPEG | Joint Photographic Experts Group | +| JSON | JavaScript Object Notation | +| k-NN | k-nearest neighbors | +| NAT | network address translation | +| NGINX | engine x | +| PDF | Portable Document Format | +| RAM | random access memory | +| REST | Representational State Transfer | +| RGB | red-green-blue | +| ROM | read-only memory | +| SAML | Security Assertion Markup Language | +| SDK | software development kit | +| SSL | Secure Sockets Layer | +| TCP | Transmission Control Protocol | +| TIFF | Tagged Image File Format | +| TLS | Transport Layer Security | +| UI | user interface | +| URI | uniform resource identifier | +| URL | uniform resource locator | +| UTC | Coordinated Universal Time | +| UTF | Unicode Transformation Format | +| XML | Extensible Markup Language | +| YAML | YAML Ain't Markup Language | + +### Code examples + +Calling out code within a sentence or code block makes it clear to readers which items are code specific. The following is general guidance about using code examples and when to use `code font`: + +* In Markdown, use single backticks (`` ` ``) for inline code formatting and triple backticks (```` ``` ````) for code blocks. For example, writing `` `discovery.type` `` in Markdown will render as `discovery.type`. A line containing three backticks should be included both before and after an example code block. +* In sentences, use code font for things relating to code, for example, “The `from` and `size` parameters are stateless, so the results are based on the latest available data.” +* Use lead-in sentences to clarify the example. Exception: API examples, for which a caption-style lead-in (heading 4) is sufficient. +* Use the phrase *such as* for brief examples within a sentence. +* Use language-specific indentation in code examples. +* Make code blocks as copy-and-paste friendly as possible. Use either the [`copy` or `copy-curl` buttons](https://github.com/opensearch-project/documentation-website/blob/main/FORMATTING_GUIDE.md#buttons). + +#### Code formatting checklist + +The following items should be in `code font`: + +* Field names, variables (including environment variables), and settings (`discovery.type`, `@timestamp`, `PATH`). Use code font for variable and setting values if it improves readability (`false`, `1h`, `5`, or 5). +* Placeholder variables. Use angle brackets for placeholder variables (`docker exec -it /bin/bash`). +* Commands, command-line utilities, and options (`docker container ls -a`, `curl`, `-v`). +* File names, file paths, and directory names (`docker-compose.yml`, `/var/www/simplesamlphp/config/`). +* URLs and URL components (`localhost`, `http://localhost:5601`). +* Index names (`logs-000001`, `.opendistro-ism-config`), endpoints (`_cluster/settings`), and query parameters (`timeout`). +* Language keywords (`if`, `for`, `SELECT`, `AND`, `FROM`). +* Operators and symbols (`/`, `<`, `*`). +* Regular expression, date, or other patterns (`^.*-\d+$`, `yyyy-MM-dd`). +* Class names (`SettingsModule`) and interface names (*`RestHandler`*). Use italics for interface names. +* Text field inputs (Enter the password `admin`). +* Email addresses (`example@example.org`). + +#### Caption-style examples + +If you use a caption-style example, use the heading **Example**, with a colon, as appropriate. 
The following are caption-style examples: + + **Example: Retrieve a specified document from an index** + + The following example shows a request that retrieves a specific document and its information from an index: + + `GET sample-index1/_doc/1` + + **Example request** + + `GET sample-index1/_doc/1` + +Sometimes, you might not want to break up the flow of the text with a new heading. In these cases, you can use an example with no heading. + + The following command maps ports 9200 and 9600, sets the discovery type to single-node, and requests the newest image of OpenSearch: + + `docker run -d -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:latest` + +#### Lead-in sentences + +When using lead-in sentences, summarize, clarify, or refer to the example that follows. A lead-in sentence is a complete sentence that ends in a colon. + + For example, the following query requests statistics for `docs` and `search`: + + `GET _nodes/stats/indices/docs,search` + +#### Referring to a variable or placeholder + +When introducing a code or command line example that refers to a variable or placeholder in the example, be direct by including the variable or placeholder name in the text. Surround the variable or placeholder name with angle brackets (`<` and `>`), for example, ``. Don't refer to the variable or placeholder by its color or format because these can change. If variable or placeholder texts have a lot in common and there are several for the user to complete, be direct by including a “template” for the input in the replaceable text. + + In the following example, replace `` with your own information: + + `~/workspace/project-name$ eb init --modules ` + +### Formatting and organization + +- Use a colon to introduce example blocks (for example, code and scripts) and most lists. Do not use a colon to introduce tables or images. + +- Use bold text for all UI elements, including pages, panes, and dialog boxes. In all cases, emphasize what the user must do as opposed to talking about the UI element itself. + +- Stacked headings should never appear in our content. Stacked headings are any two consecutive headings without intervening text. Even if it is just an introductory sentence, there should always be text under any heading. + +- Use italics for the titles of books, periodicals, and reference guides. However, do not use italics when the title of a work is also a hyperlink. + +- You can refer to APIs in three ways: + 1. When referring to API names, capitalize all words in the name (example: "Field Capabilities API"). + 2. When referring to API operations by the exact name of the endpoint, use lowercase with code format (example: "`_field_caps` API"). + 3. When describing API operations but not using the exact name of the endpoint, use lowercase (example: "field capabilities API operations" or "field capabilities operations"). + +### Images + +- Add introductory text that provides sufficient context for each image. + +- Add ALT text that describes the image for screen readers. + +- When you’re describing the location of an image, use words such as *preceding*, *previous*, or *following* instead of *above* and *below*. + +- Text that introduces an image should be a complete sentence and end with a period, not a colon. + +### Links + +- **Formal cross-references**: In most cases, a formal cross-reference (the title of the page you're linking to) is the preferred style because it provides context and helps readers understand where they're going when they choose the link. 
Follow these guidelines for formal cross-references: + - Introduce links with formal introductory text: + - Use "For information *about*" or "For more information *about*." Don't use "For information *on*." + - If you are linking to procedures, you can use either "For instructions *on*" or "instructions *for*." Don't use "instructions *about*." + - Where space is limited (for example, in a table), you can use "*See* [link text]." + - Ensure that the link text matches the section title text.

Example: "To get involved, see [Contributing](https://opensearch.org/source.html) on the OpenSearch website."
+ +- **Embedded links**: Embedded links are woven into a sentence without formal introductory text. They're especially useful in tables or other elements where space is tight. The text around the embedded link must relate to the information in the link so that the reader understands the context. Do not use *here* or *click here* for link text because it creates accessibility problems.

Example: "Finally, [delete the index](https://opensearch.org/docs/latest/api-reference/index-apis/delete-index)." + +### Lists + +The following guidelines apply to all list types: +- Make lists parallel in content and structure. Don’t mix single words with phrases, don’t start some phrases with a noun and others with a verb, and don’t mix verb forms. +- Present the items in alphabetical order if the order of items is arbitrary. +- Capitalize the first letter of the first word of each list item. +- If the list is simple, you don’t need end punctuation for the list items. +- If the list has a mixture of phrases and sentences, punctuate each list item. +- Punctuate each list item with a period if a list item has more than one sentence. +- Punctuate list items consistently. If at least one item in a list requires a period, use a period for all items in that list. +- Introductory sentences are required for lists. +- Introductory sentences should be complete sentences. +- Introductory sentences should end with a colon. +- Don’t use semicolons, commas, or conjunctions (like and or or) at the end of list items. + +### Numbers and measurement + +- Spell out cardinal numbers from 1 to 9. For example, one NAT instance. Use numerals for cardinal numbers 10 and higher. Spell out ordinal numbers: first, second, and so on. In a series that includes numbers 10 or higher, use numerals for all. Use a comma separator for numbers of four digits or more—for example, 1,000. + +- For descriptions that include time ranges, separate the numbers with an en dash. Avoid extra words such as between or from n to n. + - Correct: It can take 5–10 minutes before logs are available. + - Incorrect: It can take between 5 and 10 minutes before logs are available. + +- Use numerals for all measurement-based references, including time. Include a space between the number and the abbreviation for the unit of measure. + - Correct: + - 100 GB + - 1 TB + - 3 minutes + - 12 subnets (8 public and 4 private) + - Incorrect + - One hundred GB + - 1TB + +### Procedures + +A procedure is a series of numbered steps that a user follows to complete a specific task. Users should be able to scan for and recognize procedures easily. Make procedures recognizable by using the following: + +- Predictable content parts +- Parallel language constructions +- Consistent formatting + +Use *example*, not *sample*, to introduce example blocks (for example, code, scripts, and API requests and responses). + +#### Describing interactions with the UI + +Replace pointer-specific verbs with device-agnostic/generic verbs to accommodate readers with disabilities and users of various input methods and devices, including the pointer, keyboard, and touch screens. Don't use device-specific verbs such as _click_ or _swipe_. However, when the generic language makes it difficult to understand the instructions, you can include pointer-specific hints in parentheses. Use your judgment. If you have a question, ask your editor. + +We follow a slightly modified version of the _Microsoft Writing Style Guide_ guidance on describing interactions with a UI, provided here. + +| Verb | Use for | Examples | +| :--------- | :------- | :------- | +| **Open** | - Apps and programs
- Files and folders
- Shortcut menus
Use for websites and webpages only when necessary to match the UI. Otherwise, use _go to_.
- Don't use for commands and menus. | - Open Photos.
- Open the Reader app.
- Open the Filename file.
- To open the document in Outline view, select **View** > **Outline**.
- In WindowName, open the shortcut menu for ItemName. | +| **Close** | - Apps and programs
- Dialog boxes
- Files and folders
- Notifications and alerts
- Tabs
- The action a program or app takes when it encounters a problem and can't continue. (Don't confuse with _stop responding_). | - Close the Alarms app.
- Close Excel.
- Save and close the document.
- Closing Excel also closes all open worksheets. | +| **Leave** | Websites and webpages | Select **Submit** to complete the survey and leave this page. | +| **Go to** | - Opening a menu.
- Going to a tab or another particular place in the UI.
- Going to a website or webpage.
- It’s OK to use _On the **XXX** tab_ if the instruction is brief and continues immediately. | - Go to Search, enter the word **settings**, and then select **Settings**.&#13;
- Go to **File**, and then select **Close**.
- On the ribbon, go to the **Design** tab.
- Go to the **Deploy** tab. In the **Configuration** list ...&#13;
- On the **Deploy** tab, in the **Configuration** list ...
- Go to Example.com to register. | +| **Select** | Instructing the user to select a specific item, including:
- Selecting an option, such as a button.
- Selecting a checkbox.
- Selecting a value from a list box.
- Selecting link text to go to a link.
- Selecting an item on a menu or shortcut menu.
- Selecting an item from a gallery. | - Select the **Modify** button.
- For **Alignment**, select **Left**.
- Select the text, open the shortcut menu, and then select **Font**.
- Select **Open in new tab**.
- Select the **LinkName** link. | +| **Select and hold, select and hold (or right-click)** | Use to describe pressing and holding an element in the UI. It's OK to use _right-click_ with _select and hold_ when the instruction isn't specific to touch devices. | - To flag a message that you want to deal with later, select and hold it, and then select **Set flag**.
- Select and hold (or right-click) the Windows taskbar, and then select **Cascade windows**.
- Select and hold (or right-click) the **Start** button, and then select **Device Manager**. | +| **>** | Use a greater-than symbol (>) to separate sequential steps.
Only use this approach when there's a clear and obvious path through the UI and the selection method is the same for each step. For example, don't mix things that require opening, selecting, and choosing.
Don’t bold the greater-than symbol. Include a space before and after the symbol. | Select **Accounts** > **Other accounts** > **Add an account**. |
+| **Clear** | Clearing the selection from a checkbox. | Clear the **Header row** checkbox. |
+| **Choose** | Choosing an option, based on the customer’s preference or desired outcome. | On the **Font** tab, choose the effects you want. |
+| **Switch, turn on, turn off** | Turning a toggle key or toggle switch on or off. | - Use the **Caps lock** key to switch from typing capital letters to typing lowercase letters.&#13;
- To keep all applied filters, turn on the **Pass all filters** toggle. | +| **Enter** | Instructing the customer to type or otherwise insert a value, or to type or select a value in a combo box. | - In the search box, enter...
- In the **Tab stop position** box, enter the location where you want to set the new tab.
- In the **Deployment script name** box, enter a name for this script. | +| **Move, drag** | Moving anything from one place to another by dragging, cutting and pasting, or another method. Use for tiles and any open window (including apps, dialog boxes, and files).
Use _move through_ to describe moving around on a page, moving through screens or pages in an app, or moving up, down, right, and left in a UI. | - Drag the Filename file to the Foldername folder.
- Move the tile to the new section.
- Drag the Snipping Tool out of the way, if necessary, and then select the area you want to capture.
- If the **Apply Styles** task pane is in your way, just move it. | +| **Press** | Use _press_ to describe single key or key combination entries that users would perform on a keyboard, such as keyboard shortcuts. | - Press **F5**.
- Press **Shift+Enter**.
- Press **Ctrl+Alt+Delete**. | +| **Zoom, zoom in, zoom out** | Use _zoom_, _zoom in_, and _zoom out_ to refer to changing the magnification of the screen or window. | - Zoom in to see more details on the map.
- Zoom out to see a larger geographic area on the map.
- Zoom in or out to see more or less detail. | + +### Punctuation and capitalization + +- Use only one space after a period. + +- Use contractions carefully for a more casual tone. Use common contractions. Avoid future tense (I’ll), archaic (‘twas), colloquial (ain’t), or compound (couldn’t’ve) contractions. + +- Use sentence case for titles, headings, and table headers. Titles of standalone documents may use title case. + +- Use lowercase for nouns and noun phrases that are not proper nouns; for example, *big data*. This style follows the standard rules of American English grammar. + +- For plural forms of nouns that end in “s”, form the possessive case by adding only an apostrophe. + +- When a colon introduces a list of words, a phrase, or other sentence fragment, the first word following the colon is lowercased unless it is a proper name. When a colon introduces one or more complete sentences, the first word following it is capitalized. When text introduces a table or image, it should be a complete sentence and end with a period, not a colon. + +- Use commas to separate the following: + - Independent clauses separated by coordinating conjunctions (but, or, yet, for, and, nor, so). + - Introductory clauses, phrases, words that precede the main clause. + - Words, clauses, and phrases listed in a series. Also known as the Oxford comma. + +- An em dash (—) is the width of an uppercase M. Do not include spacing on either side. Use an em dash to set off parenthetical phrases within a sentence or set off phrases or clauses at the end of a sentence for restatement or emphasis. + +- An en dash (–) is the width of an uppercase N. In ranges, do not include spacing on either side. Use an en dash to indicate ranges in values and dates, separate a bullet heading from the following text in a list, or separate an open compound adjective (two compounds, only one of which is hyphenated) from the word that it modifies. + +- Words with prefixes are normally closed (no hyphen), whether they are nouns, verbs, adjectives, or adverbs. Note that some industry terms don’t follow this hyphenation guidance. For example, *Command Line Interface* and *high performance computing* aren’t hyphenated, and *machine learning* isn’t hyphenated when used as an adjective. Other terms are hyphenated to improve readability. Examples include *non-production*, *post-migration*, and *pre-migration*. + +- In general, comparative or superlative modifiers with “more,” “most,” “less,” or “least” don’t require hyphens. Use one only if it’s needed to avoid ambiguity. + +- The ampersand (&) should never be used in a sentence as a replacement for the word and. An exception to this is in acronyms where the ampersand is commonly used, such as in Operations & Maintenance (O&M). + +- When using a forward slash between words, do not insert space on either side of the slash. For example, *AI/ML* is correct whereas *AI / ML* is incorrect. + +- When referring to API parameters, capitalize *Boolean*. Otherwise, primitive Java data types (*byte*, *short*, *int*, *long*, *float*, *double*, and *char*) start with a lowercase letter, while non-primitive types start with an uppercase letter. + +### Topic titles + +Here are two styles you can use for topic titles: + +* *Present participle phrase* + *noun-based phrase* or *present participle phrase* + *preposition* + *noun-based phrase*, used most often for concept or task topics. 
For example: + * Configuring security + * Visualizing your data + * Running queries in the console + +* *Noun-based phrase*, used most often for reference topics. For example: + * REST API reference + * OpenSearch CLI + * Field types + * Security analytics + +Use *example*, not *sample*, in headings that introduce example blocks (for example, code, scripts, and API requests and responses). + +## UI text + +Consistent, succinct, and clear text is a critical component of a good UI. We help our users complete their tasks by providing simple instructions that follow a logical flow. + +### UI best practices + +* Follow the OpenSearch Project [naming conventions, voice, tone, and brand personality traits](#naming-conventions-voice-tone-and-brand-personality-traits) guidelines. +* Be consistent with other elements on the page and on the rest of the site. +* Use sentence case in the UI, except for product names and other proper nouns. + +### UI voice and tone + +Our UI text is people oriented and focused on empowering the user directly. We use language that is conversational, welcoming, engaging, and open and that emphasizes what the user can do with OpenSearch rather than what tasks OpenSearch can perform. The overall tone is knowledgeable but humble, informal but authoritative, informative but not dry, and friendly without being overly familiar. + +We talk to readers in their own words, never assuming that they understand how OpenSearch works. We use precise technical terms where appropriate, but we avoid technical jargon and insider lingo. We speak to readers in simple, plain, everyday language. + +For more information, see [Voice and tone](#voice-and-tone) and [Brand personality traits](#brand-personality-traits). + +### Writing guidelines + +UI text is a critical component of a user interface. We help users complete tasks by explaining concepts and providing simple instructions that follow a logical flow. We strive to use language that is consistent, succinct, and clear. + +#### What's the purpose of UI text? + +UI text includes all words, phrases, and sentences on a screen, and it has the following purposes: + +* Describes a concept or defines a term +* Explains how to complete a task +* Describes the purpose of a page, section, table, graph, or dialog box +* Walks users through tutorials and first-run experiences +* Provides context and explanation for individual UI elements that might be unfamiliar to users +* Helps users make a choice or decide if settings are relevant or required for their particular deployment scenario or environment +* Explains an alert or error + +#### Basic guidelines + +Follow these basic guidelines when writing UI text. + +##### Style + +* Keep it short. Users don’t want to read dense text. Remember that UI text can expand by 30% when it’s translated into other languages. +* Keep it simple. Try to use simple sentences (one subject, one verb, one main clause and idea) rather than compound or complex sentences. +* Prefer active voice over passive voice. For example, "You can attach up to 10 policies" is active voice, and "Up to 10 policies can be attached" is passive voice. +* Use device-agnostic language rather than mouse-specific language. For example, use _choose_ instead of _click_ (exception: use _select_ for checkboxes). + +##### Tone + +* Use a tone that is knowledgeable but humble, informal but authoritative, informative but not dry, and friendly without being overly familiar. +* Use everyday language that most users will understand. 
+* Use second person (you, your) when you address the user. +* Use _we_ if you need to refer to the OpenSearch Project as an organization; for example, "We recommend…." + +##### Mechanics + +* Use sentence case for all UI text. (Capitalize only the first word in a sentence or phrase as well as any proper nouns, such as service names. All other words are lowercase.) +* Use parallel construction (use phrases and sentences that are grammatically similar). For example, items in a list should start with either all verbs or all nouns. + + **Correct** + + Snapshots have two main uses: + * Recovering from failure + * Migrating from one cluster to another + + **Incorrect** + + Snapshots have two main uses: + * Failure recovery + * Migrating from one cluster to another + +* Use the serial (Oxford) comma. For example, “issues, bug fixes, and features”, not “issues, bug fixes and features”. +* Don’t use the ampersand (&). +* Avoid Latinisms, such as _e.g._, _i.e._, or _etc._ Instead of _e.g._, use _for example_ or _such as_. Instead of _i.e._, use _that is_ or _specifically_. Generally speaking, _etc._ and its equivalents (such as _and more_ or _and so on_) aren’t necessary. + +## Special considerations for blog posts + +Blog posts provide an informal approach to educating or inspiring readers through the personal perspective of the authors. Brief posts generally accompany service or feature releases, and longer posts may note best practices or provide creative solutions. Each post must provide a clear community benefit. + +To enhance the strengths of the blogging platform, follow these post guidelines: + +**Be conversational and informal.** + +Posts tend to be more personable, unlike technical documentation. Ask questions, include relevant anecdotes, add recommendations, and generally try to make the post as approachable as possible. However, be careful of slang, jargon, and phrases that a global audience might not understand. + +**Keep it short.** + +Deep topics don’t necessarily require long posts. Shorter, more focused posts are easier for readers to digest. Consider breaking a long post into a series, which can also encourage repeat visitors to the blog channel. + +**Avoid redundancy.** + +Posts should add to the conversation. Instead of repeating content that is already available elsewhere, link to detail pages and technical documentation. Keep only the information that is specific to the post solution or recommendations. + +**Connect with other content.** + +All posts should contain one or more calls to action that give readers the opportunity to create resources, learn more about services or features, or connect with other community members. Posts should also include metadata tags such as services, solutions, or learning levels to help readers navigate to related content. + +## Inclusive content + +When developing OpenSearch Project documentation, we strive to create content that is inclusive and free of bias. We use inclusive language to connect with the diverse and global OpenSearch Project audience, and we are careful in our word choices. Inclusive and bias-free content improves clarity and accessibility of our content for all audiences, so we avoid ableist and sexist language and language that perpetuates racist structures or stereotypes. In practical terms, this means that we do not allow certain terms to appear in our content, and we avoid using others, *depending on the context*. 
+ +Our philosophy is that we positively impact users and our industry as we proactively reduce our use of terms that are problematic in some contexts. Instead, we use more technically precise language and terms that are inclusive of all audiences. + +### Offensive terms + +The following terms may be associated with unconscious racial bias, violence, or politically sensitive topics and should not appear in OpenSearch Project content, if possible. Note that many of these terms are still present but on a path to not being supported. For example, `slave` was removed from the Python programming language in 2018, and the open-source community continues to work toward replacing these terms. + +| Don’t use | Guidance/Use instead | +|----------------|-----------------------------| +| abort | Don't use because it has unpleasant associations and is unnecessarily harsh sounding. Use *stop*, *end*, or *cancel* instead. | +| black day | blocked day | +| blacklist | deny list | +| kill | Don't use. Replace with *stop*, *end*, *clear*, *remove*, or *cancel*.

Exception: *Kill* is unavoidable when referring to Linux kill commands. | +| master | primary, main, leader | +| master account | management account | +| slave | replica, secondary, standby | +| white day | open day | +| whitelist | allow list | + +### Sensitive terms + +The following terms may be problematic *in some contexts*. This doesn’t mean that you can’t use these terms—just be mindful of their potential associations when using them, and avoid using them to refer to people. + +| Avoid using | Guidance/Use instead | +|--------------------------|-------------------------------------| +| blackout | service outage, blocked | +| demilitarized zone (DMZ) | perimeter network, perimeter zone | + +## Trademark policy + +The “OpenSearch” word mark should be used in its exact form and not abbreviated or combined with any other word or words (e.g., “OpenSearch” software rather than “OPNSRCH” or “OpenSearch-ified”). See the [OpenSearch Trademark Policy](https://opensearch.org/trademark-usage.html) for more information. Also refer to the policy and to the [OpenSearch Brand Guidelines](https://opensearch.org/brand.html) for guidance regarding the use of the OpenSearch logo. When using another party’s logo, refer to that party’s trademark guidelines. + diff --git a/TERMS.md b/TERMS.md new file mode 100644 index 00000000..8fc1ba01 --- /dev/null +++ b/TERMS.md @@ -0,0 +1,807 @@ +# OpenSearch terms + +This is how we use our terms, but we’re always open to hearing your suggestions. + +## A + +**abort** + +Do not use because it has unpleasant associations and is unnecessarily harsh sounding. Use *stop*, *end*, or *cancel* instead. + +**above** + +Use only for physical space or screen descriptions, for example, "the outlet above the floor" or "the button above the bar pane." + +For orientation within a document use *previous*, *preceding*, or *earlier*. + +**ad hoc** + +Avoid. Use *one-time* instead. + +**affect** + +Affect as a noun refers to emotion as expressed in face or body language. Affect as a verb means to influence. Do not confuse with effect. + +**AI** + +No need to define as _artificial intelligence (AI)_. + +**AI/ML** + +On first mention, use artificial intelligence and machine learning (AI/ML). + +**Alerting** + +A plugin that notifies you when data from one or more OpenSearch indexes meets certain conditions. + +**allow** + +Use allow when the user must have security permissions in order to complete the task. + +Avoid using allow to refer to making something possible for the user. Instead, rewrite to focus on what’s important from the user’s point of view. + +**allow list** + +Use to describe a list of items that are allowed (not blocked). Do not use as a verb. Do not use whitelist. + +**Amazon OpenSearch Service** + +Amazon OpenSearch Service is a managed service that makes it easy to deploy, operate, and scale OpenSearch clusters in the AWS Cloud. Amazon OpenSearch Service is the successor to Amazon Elasticsearch Service (Amazon ES) and supports OpenSearch and legacy Elasticsearch OSS (up to 7.10, the final open-source version of the software). + +**Anomaly Detection** + +A plugin that automatically detects anomalies in your OpenSearch data in near real time. + +**API operation** + +Use instead of action, method, or function. + +OpenSearch style: + +- Use the CopySnapshot operation to... +- The following API operations… + +Not OpenSearch style + +- Use the CopySnapshot action to... +- Use the CopySnapshot method to... +- Use the CopySnapshot function to... 
+ +**app or application** + +Use app for mobile software, application for all other uses. + +**appear, display, and open** + +Messages and pop-up boxes appear. Windows, pages, and applications open. The verb display requires a definite object. For example: The system displays the error message. + +**application server** + +Do not abbreviate as app server. + +**as well as** + +Avoid. Replace with in addition to or and as appropriate. + +**Asynchronous Search** + +A plugin that lets the user send search requests in the background so that the results can be used later. + +**auto scaling** + +Lower case scaling, auto scaling, and automatic scaling (but not autoscaling) are the preferred descriptive terms when generically describing auto scaling functionality. + +Do not use hyphenated auto-scaling as a compound modifier. Instead, use scaling (for example, scaling policy), or scalable (for example, scalable target or scalable, load-balanced environment). + +**AWS Signature Version 4** + +Use on first appearance. On subsequent appearances, *Signature Version 4* may be used. Only use *SigV4* when space is limited. + +## B + +**below** + +Use only for physical space or screen descriptions, such as “the outlet below the vent,” or “the button below the bar pane.” + +For orientation within a document, use *following* or *later*. + +**big data** + +**black day** + +Do not use. Use *blocked day* instead. + +**blacklist** + +Do not use. Use *deny list* instead. + +**blackout** + +Avoid using. Use *service outage* or *blocked* instead. + +**BM25** + +A ranking function used to estimate the relevance of documents to a given search query. BM25 extends [TF–IDF](#t) by normalizing document length. + +**Boolean** + +Avoid using the name of a Boolean value at the beginning of a sentence or sentence fragment. In general, capitalize the word Boolean. For specific programming languages, follow the usage in that language. + +OpenSearch style: + +- You can use the Boolean functions with Boolean expressions or integer expressions. +- IsTruncated(): A Boolean value that specifies whether the resolved target list is truncated. + +**bottom** + +Use only as a general screen reference, such as “scroll to the bottom of the page.” Don’t use for window, page, or pane references to features or controls. Rather, use *lower* instead. For example, you can use the following wording: “Choose the button on the lower left.” + +**browse** + +Use when referring to scanning information or browsing the web. Don’t use when describing how to navigate to a particular item on our site or a computer. Instead, use *see* or *navigate to*. + +**build (n., v.)** + +Use as a verb to refer to compiling and linking code. Use as a noun only to refer to a compiled version of a program (for example, *Use the current build of Amazon Linux 2*...) in a programming reference. + +## C + +**CA** + +certificate authority + +**certs, certificates** + +Use _certificates_ on first mention. It’s OK to use _certs_ thereafter. + +**checkbox, checkboxes** + +**CI/CD** + +Use _continuous integration_ and _continuous delivery (CI/CD)_ or _continuous integration and delivery (CI/CD)_ on first mention. + +**CLI** + +No need to define as _command-line interface (CLI)_. + +**cluster** + +A collection of one or more nodes. + +**cluster manager** + +A single node that routes requests for the cluster and makes changes to other nodes. Each cluster contains a single cluster manager. + +**command line, command-line** + +Two words as a noun. Hyphenate as an adjective. 
+ +**console** + +A tool inside OpenSearch Dashboards used to interact with the OpenSearch REST API. + +**Cross-Cluster Replication** + +A plugin that replicates indexes, mappings, and metadata from one OpenSearch cluster to another. Follows an active-passive model where the follower index pulls data from a leader index. + +**cyber** + +Except when dictated by open standards, use as a prefix in a closed compound: don’t use spaces or hyphens between _cyber_ and the rest of the word. + +## D + +**data** + +Use data is, not data are. Don’t use datas. Use pieces of data or equivalent to describe individual items within a set of data. + +**data center** + +**dataset** + +**data source** + +**data store, datastore** + +Two words when used generically, but one word when referring to the VMware product. + +**data type** + +**dates** + +Use one of the following date formats: + +- When a human-readable date format is preferred, spell out the date using the Month D, YYYY format (for example, _October 1, 2022_). Do not use an ordinal number for the day (use _1_, not _1st_). If the context is clear, you can omit the year on subsequent mention. If the specific day isn’t known, use the Month YYYY format (for example, _October 2022_). +- When a numeric, lexicographically sortable date is required, use the YYYY-MM-DD format (for example, _2022-10-01_). Make sure to add a zero (0) in front of a single-digit month and day. This is the ISO 8601 standard date format. Make sure also that you use a hyphen (-) and avoid omitting the year. Doing so avoids the ambiguity that’s caused by the common, locally used formats of MM/DD and DD/MM. + +**demilitarized zone (DMZ)** + +Avoid using. Use *perimeter network* or *perimeter zone* instead. + +**deny list** + +Use to describe a list of items that aren’t allowed (blocked). Do not use _blacklist_. + +**disable** + +Use *disable* to describe making a feature or command unavailable. For example: + +- Clear the checkbox to disable automatic monitoring. +- The feature is disabled by default. + +Note that alternatives to *disable*—such as *deactivate*, *turn off*, or *stop*—are acceptable usage where appropriate and may be found in existing documentation. In all cases, use language that corresponds to the language used in the UI, if applicable. + +Do not use *disable* to refer to users. + +**double-click** + +Always hyphenated. Don’t use _double click_. + +**dropdown list** + +**due to** + +Don’t use. Use _because of_ instead. + +## E + +**easy, easier, easily** + +Avoid the use of *easy*, *easier*, or *easily* if possible when describing or comparing an OpenSearch Project product, feature, or procedure in technical content. Use of these terms is audience dependent. These terms are potentially misleading or inaccurate and might be perceived as condescending by some technical users. Instead, describe what the user can do. + +On documentation landing pages, it’s acceptable to use *easy*, *easier*, or *easily* within the service description only. + +**effect** + +_Effect_ as a noun refers to something that’s caused by something else. _Effect_ as a verb means to bring about. Do not confuse with _affect_. + +**e.g.** + +Avoid. Use _for example_ or _such as_ instead. + +**Elastic IP address** + +**email** + +Use as a singular noun or adjective to refer to the collective concept, and use _message_ or _mail_ for individual items. Use _send email_ as the verb form. Don’t use the plural form because it’s a collective noun. 
+ +**enable** + +Use *enable* to describe making a feature or command available. For example: + +- Select the checkbox to enable automatic monitoring. +- The feature is enabled by default. + +Note that alternatives to *enable*—such as *activate*, *turn on*, or *start*—are acceptable usage where appropriate and may be found in existing documentation. In all cases, use language that corresponds to the language used in the UI, if applicable. + +Avoid using *enable* to refer to making something possible for the user. Instead, rewrite to focus on what's important from the user's point of view. For example, “With ABC, you can do XYZ” is a stronger statement than “ABC enables you to XYZ.” Additionally, using a task-based statement is usually more clear than the vague “…enables you to….” + +**enter** + +In general, use in preference to _type_ when a user adds text or other input (such as numbers or symbols). + +**etc., et cetera** + +Do not use. + +Generally speaking, etc. and its equivalents (such as and more or and so on) aren’t necessary. + +**execute** + +Replace with a more specific verb. In the sense of carrying out an action, use *run*, *process*, or *apply*. In the sense of initiating an operation, use *start*, *launch*, or *initiate*. + +Exception: *Execution* is unavoidable for third-party terms for which no alternative was determined, such as SQL execution plans. *Executable* is also unavoidable. + +## F + +**fail over (v.), failover (n.)** + +**Faiss** + +**file name** + +**frontend (n., adj.)** + +Use frontend as an adjective and a noun. Do not use front end or front-end. Do not make frontend possessive except as part of a compound noun, such as frontend system. + +## G + +**generative AI** + +Do not use _GenAI_, _Gen AI_, _gen AI_, or _genAI_. To avoid the overuse of *generative AI*, *AI/ML-powered applications* may also be used. + +**geodistance** + +**geohash** + +**geohex** + +**geopoint** + +**geopolygon** + +**geoshape** + +**geospatial** + +**geotile** + +## H + +**hang** + +Do not use. This term is unnecessarily violent for technical documentation. Use *stop responding* instead. + +**hardcode** + +**hard disk drive (HDD)** + +**high availability (HA)** + +**high performance computing (HPC)** + +**hostname** + +**Hugging Face** + +## I + +**i.e.** + +Do not use. Use _that_ is or _specifically_ instead. + +**if, whether** + +Do not use *if* to mean *whether*. It is best to use *whether* in reference to a choice or alternatives ("we're going whether it rains or not") and *if* when establishing a condition ("we will go if it doesn't rain"). + +**in, on** + +Use _in Windows_ or _in Linux_ in reference to components of the OS or work in the OS. Use on Windows in reference to Windows applications. Examples: + +- Use the Devices and Printers Control Panel in Windows to install a new printer. +- In Windows, run the setup command. +- Select an application that runs on Windows. + +Run applications and instances _in the cloud_, but extend services to the cloud. + +Use *on the forum*. Whatever is on the internet (the various websites, etc.), you are *on* because you cannot be *in* it. + +**index, indexes** + +In technical documentation and the UI, use *indexes* as the plural form of *index*. Use *indices* only in the context of mathematical expressions. Variable and setting names should not be changed. + +In blog posts, use the plural *indexes* unless there is a domain-specific reason (for example, a mathematical or financial context) to use *indices*. 
+ +**Index Management (IM)** + +**Index State Management (ISM)** + +**ingest pipeline** + +Not _ingestion pipeline_. + +**inline** + +**install in, on** + +install in a folder, directory, or path; install on a disk, drive, or instance. + +**internet** + +Do not capitalize. + +**invalid** + +Avoid using. Use *not valid* instead. + +**IP address** + +Don’t abbreviate as _IP only_. + +## J + +**just** + +Use *just* in the sense of *just now* (as in "the resources that you just created"). Otherwise, use *only* in all other contexts (to mean "limited to; nothing more than"). + +## K + +**keystore** + +**key-value** + +Not _key/value_. + +**kill** + +Do not use. Replace with *stop*, *end*, *clear*, *remove*, or *cancel*. + +Exception: *Kill* is unavoidable when referring to Linux kill commands. + +**k-means** + +A simple and popular unsupervised clustering ML algorithm built on top of Tribuo library that chooses random centroids and calculates iteratively to optimize the position of the centroids until each observation belongs to the cluster with the nearest mean. + +**k-NN** + +Short for _k-nearest neighbors_, the k-NN plugin enables users to search for the k-nearest neighbors to a query point across an index of vectors. No need to define. + +## L + +**launch, start** + +You _start_ an application but _launch_ an instance, environment, or cluster. + +**let** + +Avoid using _let_ to refer to making something in a service or feature possible for the user. Instead, rewrite to focus on what’s important from the user’s point of view. + +**leverage** + +Replace with _use_. + +**lifecycle** + +One word in reference to software. + +**like (prep.)** + +OK to use to call out something for comparison. + +As a general rule, if you can replace like with similar to, it’s OK to use like. But, if you can replace _like_ with _such as_, use _such as_. + +**LLM** + +Define on first appearance as _large language model (LLM)_. + +**locate in, on** + +Located _in_ (a folder, directory, path), located on a disk drive or instance. + +**log in (v.), login (adj., n.)** + +Use with technologies with interfaces that use this verb. Also note that you log in to an instance, not log into. Also use log out and logout. + +**Logstash** + +A light-weight, open-source, server-side data processing pipeline that allows you to collect data from a variety of sources, transform it on the fly, and send it to your desired destination. + +**lower left, lower right** + +Hyphenate as adjectives. Use instead of *bottom left* and *bottom right*, unless the field name uses *bottom*. For example, "The lower-right corner." + +**LTS** + +Long-Term Support + +**Lucene** + +Apache Lucene™ is a high-performance, full-featured search engine library written entirely in Java. OpenSearch uses a modified version of Lucene as the basis for search operations within OpenSearch. + +## M + +**machine learning** + +When *machine learning* is used multiple times in a document, use *machine learning (ML)* on first mention and *ML* thereafter. There is no need to redefine *ML* when *AI/ML* has already been defined. If spelled out, write *machine learning* as two words (no hyphen) in all cases, including when used as an adjective before a noun. + +**Machine Learning (ML) Commons** + +A new plugin that makes it easy to develop new ML features. It allows engineers to leverage existing open-source ML algorithms and reduce the efforts to build them from scratch. + +**master** + +Do not use. Use *primary*, *main*, or *leader* instead. 
+ +**master account** + +Do not use. Use *management account* instead. + +**may** + +Avoid. Use _can_ or _might_ instead. + +**multilayer, multilayered** + +**must, shall, should** + +_Must_ and _shall_ refer to requirements. If the reader doesn’t follow the instruction, something won’t work right. + +_Should_ is used with recommendations. If the reader doesn’t follow the instruction, it might be harder or slower, but it’ll work. + +## N + +**navigate to** + +Not navigate _in_. + +**near real time (n.), near real-time (adj.) (NRT)** + +Use _near real time_ as a noun; use near real-time as an adjective. Don’t add a hyphen between _near_ and _real time_ or _real-time_. + +Spell out _near real time_ on first mention; _NRT_ can be used on subsequent mentions. + +**node** + +A server that stores your data and processes search requests with OpenSearch, usually as part of a cluster. Do not use _master node_ and avoid using _worker node_. + +**non-production** + +Hyphenate to make the term easier to scan and read. + +## O + +**onsite** + +**OpenSearch** + +OpenSearch is a community-driven, open-source search and analytics suite derived from Apache 2.0 licensed Elasticsearch 7.10.2 and Kibana 7.10.2. It consists of a search engine daemon, OpenSearch, and a visualization and user interface, OpenSearch Dashboards. + +**OpenSearch Dashboards** + +The default visualization tool for data in OpenSearch. On first appearance, use the full name. *Dashboards* may be used on subsequent appearances. + +open source (n.), open-source (adj.) + +Use _open source_ as a noun (for example, “The code used throughout this tutorial is open source and can be freely modified”). Use _open-source_ as an adjective _(open-source software)_. + +**OpenSearch Playground** + +Do not precede with _the_. OpenSearch Playground provides a central location for existing and evaluating users to explore features in OpenSearch and OpenSearch Dashboards without downloading or installing any OpenSearch components locally. + +**operating system** + +When referencing operating systems in documentation, follow these guidelines: + +- In general, if your docs or procedures apply to both Linux and macOS, you can also include Unix. +- Unix and UNIX aren’t the same. UNIX is a trademarked name that’s owned by The Open Group. In most cases, you should use Unix. +- When referring to the Mac operating system, use macOS. Don’t say Mac, Mac OS, or OS X. +- When referring to Windows, it’s not necessary to prefix with Microsoft. +- If you need to reference multiple Unix-like operating systems, you should separate by commas and use the following order: Linux, macOS, or Unix. + +**or earlier, or later** + +OK to use with software versions. + +## P + +**Painless** + +The default scripting language for OpenSearch, either used inline or stored for repeat use. Similar to Java’s language specification. + +**per** + +- Do not use to mean _according to_ (for example, per the agreement). +- OK to use in meaning of _to_, _in_, _for_, or _by each_ (one per account) where space is limited and in set terms and phrases, such as any of the following: + - queries per second (QPS) + - bits per second (bps) + - megabytes per second (MBps) +- Consider writing around _per_ elsewhere. _Per_ can sound stuffy and confusing to some global users. + +**percent** + +Spell out in blog posts (for example, _30 percent_). + +Use % in headlines, quotations, and tables or in technical copy. 
+ +**Performance Analyzer** + +An agent and REST API that allows you to query numerous performance metrics for your cluster, including aggregations of those metrics, independent of the Java Virtual Machine (JVM). + +**please** + +Avoid using except in quoted text. + +**plugin** + +Tools inside of OpenSearch that can be customized to enhance OpenSearch’s functionality. For a list of core plugins, see the [OpenSearch plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) page. Capitalize if it appears as part of the product name in the UI. + +**pop-up** + +**premise, premises** + +With reference to property and buildings, always form as plural. + +Correct: an on-premises solution + +Incorrect: an on-premise solution, an on-prem solution + +**pretrain** + +**primary shard** + +A Lucene instance that contains data for some or all of an index. + +**purge** + +Use only in reference to specific programming methods. Otherwise, use *delete*, *clear*, or *remove* instead. + +## Q + +**query** + +A call used to request information about your data. + +## R + +**real time (n.) real-time (adj.)** + +Use with caution; this term can imply a degree of responsiveness or speed that may not be true. When needed, use _real time_ as a noun (for example “The request is sent in real time”). Use _real-time_ as an adjective (“A real-time feed is displayed...”). + +**recall** + +The quantity of documents returned from a query. + +**replica shard** + +Copy of a primary shard. Helps improve performance when using indexes across multiple nodes. + +**repo** + +Use as a synonym for repository, on second and subsequent use. + +**RPM Package Manager (RPM)** + +Formerly known as RedHat Package Manager. An open-source package management system for use with Linux distributions. + +**rule** + +A set of conditions, internals, and actions that create notifications. + +## S + +**screenshot** + +**segregate** + +Avoid using. Use *separate* or *isolate* instead. + +**setting** + +A key-value pair that creates a mapping in one of the many YAML configuration files used throughout OpenSearch. Sometimes alternatively called parameters, the programming language manipulating the key-value pair usually dictates the name of this mapping in a YAML file. For OpenSearch documentation (Java), they are properly a `Setting` object. + +The following examples of settings illustrate key-value pairs with a colon separating the two elements: + +`Settings.index.number_of_shards: 4` + +`plugins.security.audit.enable_rest: true` + +**set up (v.), setup (n., adj.)** + +Use _set up_ as a verb (“To set up a new user...”). Use _setup_ as a noun or adjective (“To begin setup...”). + +**shard** + +A piece of an index that consumes CPU and memory. Operates as a full Lucene index. + +**simple, simply** + +Don't use. Both *simple* and *simply* are not neutral in tone and might sound condescending to some users. If you mean *only*, use *only* instead. + +**since** + +Use only to describe time events. Don’t use in place of because. + +**slave** + +Do not use. Use *replica*, *secondary*, or *standby* instead. + +**Snapshot Management (SM)** + +**solid state drive (SSD)** + +**standalone** + +**start, launch** + +You _start_ an application but _launch_ an instance, environment, or cluster. + +**startup (n.), start up (v.)** + +Never hyphenated. Use _startup_ as a noun (for example, “The following startup procedure guides you through...”). Use _start up_ as a verb (“You can start up the instances by...”). 
+ +**Stochastic Gradient Descent (SGD)** + +## T + +**term frequency–inverse document frequency (TF–IDF)** + +A numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. + +**time out (verb), timeout (noun, adjective)** + +Never hyphenate. Use _time out_ as a verb (“The request will time out if the server doesn’t respond”). Use _timeout_ as a noun or adjective (“You can set the timeout interval by entering a number into...”). + +**time frame** + +**time-series data** + +Data that's provided as part of a metric. The time value is assumed to be when the value occurred. + +**timestamp** + +**time zone** + +**trade-off** + +**trigger** + +Avoid using as a verb to refer to an action that precipitates a subsequent action. It is OK to use when referring to a feature name, such as a *trigger function* or *time-triggered architecture*. As a verb, use an alternative, such as *initiate*, *invoke*, *launch*, or *start*. + +**truststore** + +**turn on, turn off** + +Use *turn on* and *turn off* in reference to a toggle to describe switching a setting or mode on or off. + +Don't use *choose*, *select*, *clear*, *slide*, *enable*, or *disable* for a toggle. + +For making a feature available or unavailable, use *enable*. + +## U + +**UltraWarm** + +A storage tier that you can use to store and analyze your data with Elasticsearch and Kibana that is optimized for performance. To learn more about the service, see the introductory [blog post](https://aws.amazon.com/about-aws/whats-new/2020/05/aws-announces-amazon-elasticsearch-service-ultrawarm-general-availability/). + +**upper left, upper right** + +Hyphenate as adjectives. Use instead of *top left* and *top right*, unless the field name uses *top*. For example, "The upper-right corner." + +**US** + +No periods, as specified in the Chicago Manual of Style. + +**user** + +In most cases, replace with the more direct form you. Reserve _user_ for cases where you are referring to a third party (not the audience you are writing for). + +**username** + +## V + +**version** + +**v., vs., versus** + +Do not use. Use _compared_ to or _compared with_ instead. + +**via** + +Do not use. Replace with by using, through, or with or a more specific phrase such as by accessing or by choosing. + +## W + +**web** + +**webpage** + +Never _web page_. + +**website** + +Never _web site_. + +**while, although, whereas** + +Only use _while_ to mean “during an interval of time.” Don’t use it to mean although because it is often ambiguous. _Whereas_ is a better alternative to although in many cases, but it can sound overly formal. + +**white day** + +Do not use. Use *open day* instead. + +**whitelist** + +Do not use. Use *allow list* instead. + +**white space** + +**wish, want, desire, need** + +_Wish_ and _desire_ are indirect and nuanced versions of _want_. Don’t use them. Be direct. + +Do not confuse wants with needs. Use the term that’s appropriate to the situation. _Need_ connotes a requirement or obligation, whereas _want_ indicates that you have an intent but still a choice of valid actions. + +## Y + +**Yellowdog Updater Modified (YUM)** + +An open-source tool for command-line and graphical-based package management for RPM (RedHat Package Manager)-based Linux systems. 
\ No newline at end of file diff --git a/_about/breaking-changes.md b/_about/breaking-changes.md new file mode 100644 index 00000000..9c0bd513 --- /dev/null +++ b/_about/breaking-changes.md @@ -0,0 +1,44 @@ +--- +layout: default +title: Breaking changes +nav_order: 5 +permalink: /breaking-changes/ +--- + +## 1.x + +### Migrating to OpenSearch and limits on the number of nested JSON objects + +Migrating from Elasticsearch OSS version 6.8 to OpenSearch version 1.x will fail when a cluster contains any document that includes more than 10,000 nested JSON objects across all fields. Elasticsearch version 7.0 introduced the `index.mapping.nested_objects.limit` setting to guard against out-of-memory errors and assigned the setting a default of `10000`. OpenSearch adopted this setting at its inception and enforces the limitation on nested JSON objects. However, because the setting is not present in Elasticsearch 6.8 and not recognized by this version, migration to OpenSearch 1.x can result in incompatibility issues that block shard relocation between Elasticsearch 6.8 and OpenSearch versions 1.x when the number of nested JSON objects in any document surpasses the default limit. + +Therefore, we recommend evaluating your data for these limits before attempting to migrate from Elasticsearch 6.8. + + +## 2.0.0 + +### Remove mapping types parameter + +The `type` parameter has been removed from all OpenSearch API endpoints. Instead, indexes can be categorized by document type. For more details, see issue [#1940](https://github.com/opensearch-project/opensearch/issues/1940). + +### Deprecate non-inclusive terms + +Non-inclusive terms are deprecated in version 2.x and will be permanently removed in OpenSearch 3.0. We are using the following replacements: + +- "Whitelist" is now "Allow list" +- "Blacklist" is now "Deny list" +- "Master" is now "Cluster Manager" + +### Add OpenSearch Notifications plugins + +In OpenSearch 2.0, the Alerting plugin is now integrated with new plugins for Notifications. If you want to continue to use the notification action in the Alerting plugin, install the new backend plugins `notifications-core` and `notifications`. If you want to manage notifications in OpenSearch Dashboards, use the new `notificationsDashboards` plugin. For more information, see [Notifications]({{site.url}}{{site.baseurl}}/observing-your-data/notifications/index/) on the OpenSearch documentation page. + +### Drop support for JDK 8 + +A Lucene upgrade forced OpenSearch to drop support for JDK 8. As a consequence, the [Java high-level REST client]({{site.url}}{{site.baseurl}}/clients/java-rest-high-level/) no longer supports JDK 8. Restoring JDK 8 support is currently an `opensearch-java` proposal [#156](https://github.com/opensearch-project/opensearch-java/issues/156) and will require removing OpenSearch core as a dependency from the Java client (issue [#262](https://github.com/opensearch-project/opensearch-java/issues/262)). + + +## 2.5.0 + +### Wildcard query behavior for text fields + +OpenSearch 2.5 contains a bug fix to correct the behavior of the `case_insensitive` parameter for the `wildcard` query on text fields. As a result, a wildcard query on text fields that ignored case sensitivity and erroneously returned results prior to the bug fix will not return the same results. For more information, see issue [#8711](https://github.com/opensearch-project/OpenSearch/issues/8711). 
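For example, a case-insensitive `wildcard` query on a text field similar to the following sketch (the `products` index and `title` field are hypothetical) might return different results in 2.5.0 and later than it did before the fix:

```json
GET products/_search
{
  "query": {
    "wildcard": {
      "title": {
        "value": "open*search",
        "case_insensitive": true
      }
    }
  }
}
```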
\ No newline at end of file diff --git a/_about/index.md b/_about/index.md new file mode 100644 index 00000000..d2cc011b --- /dev/null +++ b/_about/index.md @@ -0,0 +1,85 @@ +--- +layout: default +title: Getting started +nav_order: 1 +has_children: false +has_toc: false +nav_exclude: true +permalink: /about/ +redirect_from: + - /docs/opensearch/ + - /opensearch/ + - /opensearch/index/ +--- + +{%- comment -%}The `/docs/opensearch/` redirect is specifically to support the UI links in OpenSearch Dashboards 1.0.0.{%- endcomment -%} + +# OpenSearch and OpenSearch Dashboards +**Version {{site.opensearch_major_minor_version}}** +{: .label .label-blue } + +This section contains documentation for OpenSearch and OpenSearch Dashboards. + +## Getting started + +- [Intro to OpenSearch]({{site.url}}{{site.baseurl}}/intro/) +- [Quickstart]({{site.url}}{{site.baseurl}}/quickstart/) +- [Install OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/) +- [Install OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/index/) +- [See the FAQ](https://opensearch.org/faq) + +## Why use OpenSearch? + +With OpenSearch, you can perform the following use cases: + + + + + + + + + + + + + + + + + + + + + + +
Fast, scalable full-text searchApplication and infrastructure monitoringSecurity and event information managementOperational health tracking
Help users find the right information within your application, website, or data lake catalog. Easily store and analyze log data, and set automated alerts for underperformance.Centralize logs to enable real-time security monitoring and forensic analysis.Use observability logs, metrics, and traces to monitor your applications and business in real time.
+ +**Additional features and plugins:** + +OpenSearch has several features and plugins to help index, secure, monitor, and analyze your data. Most OpenSearch plugins have corresponding OpenSearch Dashboards plugins that provide a convenient, unified user interface. +- [Anomaly detection]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/) - Identify atypical data and receive automatic notifications +- [KNN]({{site.url}}{{site.baseurl}}/search-plugins/knn/) - Find “nearest neighbors” in your vector data +- [Performance Analyzer]({{site.url}}{{site.baseurl}}/monitoring-plugins/pa/) - Monitor and optimize your cluster +- [SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/index/) - Use SQL or a piped processing language to query your data +- [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/) - Automate index operations +- [ML Commons plugin]({{site.url}}{{site.baseurl}}/ml-commons-plugin/index/) - Train and execute machine-learning models +- [Asynchronous search]({{site.url}}{{site.baseurl}}/search-plugins/async/) - Run search requests in the background +- [Cross-cluster replication]({{site.url}}{{site.baseurl}}/replication-plugin/index/) - Replicate your data across multiple OpenSearch clusters + + +## The secure path forward +OpenSearch includes a demo configuration so that you can get up and running quickly, but before using OpenSearch in a production environment, you must [configure the Security plugin manually]({{site.url}}{{site.baseurl}}/security/configuration/index/) with your own certificates, authentication method, users, and passwords. + +## Looking for the Javadoc? + +See [opensearch.org/javadocs/](https://opensearch.org/javadocs/). + +## Get involved + +[OpenSearch](https://opensearch.org) is supported by Amazon Web Services. All components are available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0.html) on [GitHub](https://github.com/opensearch-project/). +The project welcomes GitHub issues, bug fixes, features, plugins, documentation---anything at all. To get involved, see [Contributing](https://opensearch.org/source.html) on the OpenSearch website. + +--- + +OpenSearch includes certain Apache-licensed Elasticsearch code from Elasticsearch B.V. and other source code. Elasticsearch B.V. is not the source of that other source code. ELASTICSEARCH is a registered trademark of Elasticsearch B.V. \ No newline at end of file diff --git a/_opensearch/index.md b/_about/intro.md similarity index 63% rename from _opensearch/index.md rename to _about/intro.md index 505faeec..ef1dc497 100644 --- a/_opensearch/index.md +++ b/_about/intro.md @@ -1,23 +1,17 @@ --- layout: default -title: About OpenSearch -nav_order: 1 -has_children: false -has_toc: false -redirect_from: - - /docs/opensearch/ - - /opensearch/ +title: Intro to OpenSearch +nav_order: 2 +permalink: /intro/ --- -{%- comment -%}The `/docs/opensearch/` redirect is specifically to support the UI links in OpenSearch Dashboards 1.0.0.{%- endcomment -%} - # Introduction to OpenSearch -OpenSearch is a distributed search and analytics engine based on [Apache Lucene](https://lucene.apache.org/). After adding your data to OpenSearch, you can perform full-text searches on it with all of the features you might expect: search by field, search multiple indices, boost fields, rank results by score, sort results by field, and aggregate results. +OpenSearch is a distributed search and analytics engine based on [Apache Lucene](https://lucene.apache.org/). 
After adding your data to OpenSearch, you can perform full-text searches on it with all of the features you might expect: search by field, search multiple indexes, boost fields, rank results by score, sort results by field, and aggregate results. Unsurprisingly, people often use search engines like OpenSearch as the backend for a search application---think [Wikipedia](https://en.wikipedia.org/wiki/Wikipedia:FAQ/Technical#What_software_is_used_to_run_Wikipedia?) or an online store. It offers excellent performance and can scale up and down as the needs of the application grow or shrink. -An equally popular, but less obvious use case is log analytics, in which you take the logs from an application, feed them into OpenSearch, and use the rich search and visualization functionality to identify issues. For example, a malfunctioning web server might throw a 500 error 0.5% of the time, which can be hard to notice unless you have a real-time graph of all HTTP status codes that the server has thrown in the past four hours. You can use [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/) to build these sorts of visualizations from data in OpenSearch. +An equally popular, but less obvious use case is log analytics, in which you take the logs from an application, feed them into OpenSearch, and use the rich search and visualization functionality to identify issues. For example, a malfunctioning web server might throw a 500 error 0.5% of the time, which can be hard to notice unless you have a real-time graph of all HTTP status codes that the server has thrown in the past four hours. You can use [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/) to build these sorts of visualizations from data in OpenSearch. ## Clusters and nodes @@ -29,9 +23,9 @@ You can run OpenSearch locally on a laptop---its system requirements are minimal In a single node cluster, such as a laptop, one machine has to do everything: manage the state of the cluster, index and search data, and perform any preprocessing of data prior to indexing it. As a cluster grows, however, you can subdivide responsibilities. Nodes with fast disks and plenty of RAM might be great at indexing and searching data, whereas a node with plenty of CPU power and a tiny disk could manage cluster state. For more information on setting node types, see [Cluster formation]({{site.url}}{{site.baseurl}}/opensearch/cluster/). -## Indices and documents +## Indexes and documents -OpenSearch organizes data into *indices*. Each index is a collection of JSON *documents*. If you have a set of raw encyclopedia articles or log lines that you want to add to OpenSearch, you must first convert them to [JSON](https://www.json.org/). A simple JSON document for a movie might look like this: +OpenSearch organizes data into *indexes*. Each index is a collection of JSON *documents*. If you have a set of raw encyclopedia articles or log lines that you want to add to OpenSearch, you must first convert them to [JSON](https://www.json.org/). A simple JSON document for a movie might look like this: ```json { @@ -55,14 +49,14 @@ When you add the document to an index, OpenSearch adds some metadata, such as th } ``` -Indices also contain mappings and settings: +Indexes also contain mappings and settings: - A *mapping* is the collection of *fields* that documents in the index have. In this case, those fields are `title` and `release_date`. - Settings include data like the index name, creation date, and number of shards. 
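For illustration, the following request sketches one way to create such an index up front, defining the `title` and `release_date` fields from the example document along with basic shard settings (the index name `movies` and the field types shown here are assumptions for this example):

```json
PUT movies
{
  "settings": {
    "index": {
      "number_of_shards": 1,
      "number_of_replicas": 1
    }
  },
  "mappings": {
    "properties": {
      "title": { "type": "text" },
      "release_date": { "type": "date" }
    }
  }
}
```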
## Primary and replica shards -OpenSearch splits indices into *shards* for even distribution across nodes in a cluster. For example, a 400 GB index might be too large for any single node in your cluster to handle, but split into ten shards, each one 40 GB, OpenSearch can distribute the shards across ten nodes and work with each shard individually. +OpenSearch splits indexes into *shards* for even distribution across nodes in a cluster. For example, a 400 GB index might be too large for any single node in your cluster to handle, but split into ten shards, each one 40 GB, OpenSearch can distribute the shards across ten nodes and work with each shard individually. By default, OpenSearch creates a *replica* shard for each *primary* shard. If you split your index into ten shards, for example, OpenSearch also creates ten replica shards. These replica shards act as backups in the event of a node failure---OpenSearch distributes replica shards to different nodes than their corresponding primary shards---but they also improve the speed and rate at which the cluster can process search requests. You might specify more than one replica per index for a search-heavy workload. @@ -71,7 +65,7 @@ Despite being a piece of an OpenSearch index, each shard is actually a full Luce ## REST API -You interact with OpenSearch clusters using the REST API, which offers a lot of flexibility. You can use clients like [curl](https://curl.haxx.se/) or any programming language that can send HTTP requests. To add a JSON document to an OpenSearch index (i.e. index a document), you send an HTTP request: +You interact with OpenSearch clusters using the REST API, which offers a lot of flexibility. You can use clients like [curl](https://curl.se/) or any programming language that can send HTTP requests. To add a JSON document to an OpenSearch index (i.e. index a document), you send an HTTP request: ```json PUT https://://_doc/ @@ -83,14 +77,36 @@ PUT https://://_doc/ To run a search for the document: -``` +```json GET https://://_search?q=wind ``` To delete the document: -``` +```json DELETE https://://_doc/ ``` -You can change most OpenSearch settings using the REST API, modify indices, check the health of the cluster, get statistics---almost everything. +You can change most OpenSearch settings using the REST API, modify indexes, check the health of the cluster, get statistics---almost everything. + +## Advanced concepts + +The following section describes more advanced OpenSearch concepts. + +### Translog + +Any index changes, such as document indexing or deletion, are written to disk during a Lucene commit. However, Lucene commits are expensive operations, so they cannot be performed after every change to the index. Instead, each shard records every indexing operation in a transaction log called _translog_. When a document is indexed, it is added to the memory buffer and recorded in the translog. After a process or host restart, any data in the in-memory buffer is lost. Recording the document in the translog ensures durability because the translog is written to disk. + +Frequent refresh operations write the documents in the memory buffer to a segment and then clear the memory buffer. Periodically, a [flush](#flush) performs a Lucene commit, which includes writing the segments to disk using `fsync`, purging the old translog, and starting a new translog. Thus, a translog contains all operations that have not yet been flushed. 
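Translog behavior is configurable per index. As a sketch, assuming the standard translog settings are available in your version, the following request switches a hypothetical `movies` index from the default per-request durability to an asynchronous `fsync` on a fixed interval:

```json
PUT movies/_settings
{
  "index": {
    "translog": {
      "durability": "async",
      "sync_interval": "10s"
    }
  }
}
```

With `async` durability, the translog is fsynced and committed every `sync_interval` instead of after every request, which can improve indexing throughput at the cost of possibly losing the operations from the last interval if the node fails.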
+ +### Refresh + +Periodically, OpenSearch performs a _refresh_ operation, which writes the documents from the in-memory Lucene index to files. These files are not guaranteed to be durable because an `fsync` is not performed. A refresh makes documents available for search. + +### Flush + +A _flush_ operation persists the files to disk using `fsync`, ensuring durability. Flushing ensures that the data stored only in the translog is recorded in the Lucene index. OpenSearch performs a flush as needed to ensure that the translog does not grow too large. + +### Merge + +In OpenSearch, a shard is a Lucene index, which consists of _segments_ (or segment files). Segments store the indexed data and are immutable. Periodically, smaller segments are merged into larger ones. Merging reduces the overall number of segments on each shard, frees up disk space, and improves search performance. Eventually, segments reach a maximum size specified in the merge policy and are no longer merged into larger segments. The merge policy also specifies how often merges are performed. \ No newline at end of file diff --git a/_about/quickstart.md b/_about/quickstart.md new file mode 100644 index 00000000..5c7da295 --- /dev/null +++ b/_about/quickstart.md @@ -0,0 +1,165 @@ +--- +layout: default +title: Quickstart +nav_order: 3 +permalink: /quickstart/ +redirect_from: + - /opensearch/install/quickstart/ +--- + +# Quickstart + +Get started using OpenSearch and OpenSearch Dashboards by deploying your containers with [Docker](https://www.docker.com/). Before proceeding, you need to [get Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://github.com/docker/compose) installed on your local machine. + +The Docker Compose commands used in this guide are written with a hyphen (for example, `docker-compose`). If you installed Docker Desktop on your machine, which automatically installs a bundled version of Docker Compose, then you should remove the hyphen. For example, change `docker-compose` to `docker compose`. +{: .note} + +## Starting your cluster + +You'll need a special file, called a Compose file, that Docker Compose uses to define and create the containers in your cluster. The OpenSearch Project provides a sample Compose file that you can use to get started. Learn more about working with Compose files by reviewing the official [Compose specification](https://docs.docker.com/compose/compose-file/). + +1. Before running OpenSearch on your machine, you should disable memory paging and swapping performance on the host to improve performance and increase the number of memory maps available to OpenSearch. See [important system settings]({{site.url}}{{site.baseurl}}/opensearch/install/important-settings/) for more information. + ```bash + # Disable memory paging and swapping. + sudo swapoff -a + + # Edit the sysctl config file that defines the host's max map count. + sudo vi /etc/sysctl.conf + + # Set max map count to the recommended value of 262144. + vm.max_map_count=262144 + + # Reload the kernel parameters. + sudo sysctl -p + ``` +1. Download the sample Compose file to your host. You can download the file with command line utilities like `curl` and `wget`, or you can manually copy [docker-compose.yml](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/docker-compose.yml) from the OpenSearch Project documentation-website repository using a web browser. 
+ ```bash + # Using cURL: + curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/docker-compose.yml + + # Using wget: + wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/docker-compose.yml + ``` +1. In your terminal application, navigate to the directory containing the `docker-compose.yml` file you just downloaded, and run the following command to create and start the cluster as a background process. + ```bash + docker-compose up -d + ``` +1. Confirm that the containers are running with the command `docker-compose ps`. You should see an output like the following: + ```bash + $ docker-compose ps + NAME COMMAND SERVICE STATUS PORTS + opensearch-dashboards "./opensearch-dashbo…" opensearch-dashboards running 0.0.0.0:5601->5601/tcp + opensearch-node1 "./opensearch-docker…" opensearch-node1 running 0.0.0.0:9200->9200/tcp, 9300/tcp, 0.0.0.0:9600->9600/tcp, 9650/tcp + opensearch-node2 "./opensearch-docker…" opensearch-node2 running 9200/tcp, 9300/tcp, 9600/tcp, 9650/tcp + ``` +1. Query the OpenSearch REST API to verify that the service is running. You should use `-k` (also written as `--insecure`) to disable hostname checking because the default security configuration uses demo certificates. Use `-u` to pass the default username and password (`admin:`). + ```bash + curl https://localhost:9200 -ku admin: + ``` + Sample response: + ```json + { + "name" : "opensearch-node1", + "cluster_name" : "opensearch-cluster", + "cluster_uuid" : "W0B8gPotTAajhMPbC9D4ww", + "version" : { + "distribution" : "opensearch", + "number" : "2.6.0", + "build_type" : "tar", + "build_hash" : "7203a5af21a8a009aece1474446b437a3c674db6", + "build_date" : "2023-02-24T18:58:37.352296474Z", + "build_snapshot" : false, + "lucene_version" : "9.5.0", + "minimum_wire_compatibility_version" : "7.10.0", + "minimum_index_compatibility_version" : "7.0.0" + }, + "tagline" : "The OpenSearch Project: https://opensearch.org/" + } + ``` +1. Explore OpenSearch Dashboards by opening `http://localhost:5601/` in a web browser on the same host that is running your OpenSearch cluster. The default username is `admin` and the default password is set in your `docker-compose.yml` file in the `OPENSEARCH_INITIAL_ADMIN_PASSWORD=` setting. + +## Create an index and field mappings using sample data + +Create an index and define field mappings using a dataset provided by the OpenSearch Project. The same fictitious e-commerce data is also used for sample visualizations in OpenSearch Dashboards. To learn more, see [Getting started with OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/). + +1. Download [ecommerce-field_mappings.json](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce-field_mappings.json). This file defines a [mapping]({{site.url}}{{site.baseurl}}/opensearch/mappings/) for the sample data you will use. + ```bash + # Using cURL: + curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce-field_mappings.json + + # Using wget: + wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce-field_mappings.json + ``` +1. 
Download [ecommerce.json](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json). This file contains the index data formatted so that it can be ingested by the bulk API. To learn more, see [index data]({{site.url}}{{site.baseurl}}/opensearch/index-data/) and [Bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/). + ```bash + # Using cURL: + curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json + + # Using wget: + wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json + ``` +1. Define the field mappings with the mapping file. + ```bash + curl -H "Content-Type: application/x-ndjson" -X PUT "https://localhost:9200/ecommerce" -ku admin: --data-binary "@ecommerce-field_mappings.json" + ``` +1. Upload the index to the bulk API. + ```bash + curl -H "Content-Type: application/x-ndjson" -X PUT "https://localhost:9200/ecommerce/_bulk" -ku admin: --data-binary "@ecommerce.json" + ``` +1. Query the data using the search API. The following command submits a query that will return documents where `customer_first_name` is `Sonya`. + ```bash + curl -H 'Content-Type: application/json' -X GET "https://localhost:9200/ecommerce/_search?pretty=true" -ku admin: -d' {"query":{"match":{"customer_first_name":"Sonya"}}}' + ``` + Queries submitted to the OpenSearch REST API will generally return a flat JSON by default. For a human readable response body, use the query parameter `pretty=true`. For more information about `pretty` and other useful query parameters, see [Common REST parameters]({{site.url}}{{site.baseurl}}/opensearch/common-parameters/). +1. Access OpenSearch Dashboards by opening `http://localhost:5601/` in a web browser on the same host that is running your OpenSearch cluster. The default username is `admin` and the password is set in your `docker-compose.yml` file in the `OPENSEARCH_INITIAL_ADMIN_PASSWORD=` setting. +1. On the top menu bar, go to **Management > Dev Tools**. +1. In the left pane of the console, enter the following: + ```json + GET ecommerce/_search + { + "query": { + "match": { + "customer_first_name": "Sonya" + } + } + } + ``` +1. Choose the triangle icon at the top right of the request to submit the query. You can also submit the request by pressing `Ctrl+Enter` (or `Cmd+Enter` for Mac users). To learn more about using the OpenSearch Dashboards console for submitting queries, see [Running queries in the console]({{site.url}}{{site.baseurl}}/dashboards/run-queries/). + +## Next steps + +You successfully deployed your own OpenSearch cluster with OpenSearch Dashboards and added some sample data. Now you're ready to learn about configuration and functionality in more detail. 
Here are a few recommendations on where to begin: +- [About the Security plugin]({{site.url}}{{site.baseurl}}/security/index/) +- [OpenSearch configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/) +- [OpenSearch plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) +- [Getting started with OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/) +- [OpenSearch tools]({{site.url}}{{site.baseurl}}/tools/index/) +- [Index APIs]({{site.url}}{{site.baseurl}}/api-reference/index-apis/index/) + +## Common issues + +Review these common issues and suggested solutions if your containers fail to start or exit unexpectedly. + +### Docker commands require elevated permissions + +Eliminate the need for running your Docker commands with `sudo` by adding your user to the `docker` user group. See Docker's [Post-installation steps for Linux](https://docs.docker.com/engine/install/linux-postinstall/) for more information. +```bash +sudo usermod -aG docker $USER +``` + +### Error message: "-bash: docker-compose: command not found" + +If you installed Docker Desktop, then Docker Compose is already installed on your machine. Try `docker compose` (without the hyphen) instead of `docker-compose`. See [Use Docker Compose](https://docs.docker.com/get-started/08_using_compose/). + +### Error message: "docker: 'compose' is not a docker command." + +If you installed Docker Engine, then you must install Docker Compose separately, and you will use the command `docker-compose` (with a hyphen). See [Docker Compose](https://github.com/docker/compose). + +### Error message: "max virtual memory areas vm.max_map_count [65530] is too low" + +OpenSearch will fail to start if your host's `vm.max_map_count` is too low. Review the [important system settings]({{site.url}}{{site.baseurl}}/opensearch/install/important-settings/) if you see the following errors in the service log, and set `vm.max_map_count` appropriately. +```bash +opensearch-node1 | ERROR: [1] bootstrap checks failed +opensearch-node1 | [1]: max virtual memory areas vm.max_map_count [65530] is too low, increase to at least [262144] +opensearch-node1 | ERROR: OpenSearch did not exit normally - check the logs at /usr/share/opensearch/logs/opensearch-cluster.log +``` diff --git a/_about/version-history.md b/_about/version-history.md new file mode 100644 index 00000000..25e34556 --- /dev/null +++ b/_about/version-history.md @@ -0,0 +1,53 @@ +--- +layout: default +title: Version history +nav_order: 4 +permalink: /version-history/ +--- + +# Version history + +OpenSearch version | Release highlights | Release date +:--- | :--- | :--- +[2.12.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.12.0.md) | Makes concurrent segment search and conversational search generally available. Provides an experimental OpenSearch Assistant Toolkit, including agents and tools, workflow automation, and OpenSearch Assistant for OpenSearch Dashboards UI. Adds a new match-only text field, query insights to monitor top N queries, and k-NN search on nested fields. For a full list of release highlights, see the Release Notes. | 20 February 2024 +[2.11.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.11.1.md) | Includes maintenance changes and bug fixes for cross-cluster replication, alerting, observability, OpenSearch Dashboards, index management, machine learning, security, and security analytics. 
For a full list of release highlights, see the Release Notes. | 30 November 2023 +[2.11.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.11.0.md) | Adds multimodal and sparse neural search capability and the ability to take shallow snapshots that refer to data stored in remote-backed storage. Makes the search comparison tool generally available. Includes a simplified workflow to create threat detectors in Security Analytics and improved security in OpenSearch Dashboards. Experimental features include a new framework and toolset for distributed tracing and updates to conversational search. For a full list of release highlights, see the Release Notes. | 16 October 2023 +[2.10.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.10.0.md) | Makes remote-backed storage generally available. Adds hybrid search capability, custom log types for Security Analytics, IP2Geo ingest processor, and delimited term frequency token filter. Includes a new look and feel for OpenSearch Dashboards and updates the Discover tool. Adds Microsoft Teams webhook support for notifications. Experimental features include concurrent segment search and conversational search. For a full list of release highlights, see the Release Notes. | 25 September 2023 +[2.9.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.9.0.md) | Makes search pipelines and the Neural Search plugin generally available. Adds ML model access control and integration with external ML tools. Implements k-NN byte vectors and efficient filtering with the Faiss engine. Integrates alerting and anomaly detection with OpenSearch Dashboards and adds composite monitors. Adds two new index codec algorithm options. Includes a new ingestion schema for Security Analytics, geoshape aggregations, and extensions---a new mechanism for extending OpenSearch functionality. For a full list of release highlights, see the Release Notes. | 24 July 2023 +[2.8.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.8.0.md) | Adds cross-cluster query with PPL, search pipelines, an option to turn on segment replication as the default replication type, improved searchable snapshot performance, and Amazon OpenSearch Serverless support with SigV4 authentication for multiple data sources. Includes the UI for the flush, refresh, and clear cache operations in OpenSearch Dashboards. For a full list of release highlights, see the Release Notes. | 06 June 2023 +[2.7.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.7.0.md) | Includes searchable snapshots and segment replication, which are now generally available. Adds multiple data sources, observability features, dynamic tenant management, component templates, and shape-based map filters in OpenSearch Dashboards. Includes the flat object field type, hot shard identification, and a new automatic reloading mechanism for ML models. For a full list of release highlights, see the Release Notes. | 02 May 2023 +[2.6.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.6.0.md) | Includes simple schema for observability, index management UI enhancements, Security Analytics enhancements, search backpressure at the coordinator node level, and the ability to add maps to dashboards. 
Experimental features include a new ML model health dashboard, new text embedding models in ML, and SigV4 authentication in Dashboards. For a full list of release highlights, see the Release Notes. | 28 February 2023 +[2.5.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.5.0.md) | Includes index management UI enhancements, multi-layer maps, Jaeger support for observability, Debian distributions, returning cluster health by awareness attribute, cluster manager task throttling, weighted zonal search request routing policy, and query string support in index rollups. Experimental features include request-level durability in remote-backed storage and GPU acceleration for ML nodes. For a full list of release highlights, see the Release Notes. | 24 January 2023 +[2.4.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.4.1.md) | Includes maintenance changes and bug fixes for gradle check and indexing pressure tests. Adds support for skipping changelog. | 13 December 2022 +[2.4.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.4.0.md) | Includes Windows support, Point-in-time search, custom k-NN filtering, xy_point and xy_shape field types for Cartesian coordinates, GeoHex grid aggregation, and resilience enhancements, including search backpressure. In OpenSearch Dashboards, this release adds snapshot restore functionality, multiple authentication, and aggregate view of saved objects. This release includes the following experimental features: searchable snapshots, Compare Search Results, multiple data sources in OpenSearch Dashboards, a new Model Serving Framework in ML Commons, a new Neural Search plugin that supports semantic search, and a new Security Analytics plugin to analyze security logs. For a full list of release highlights, see the Release Notes. | 15 November 2022 +[2.3.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.3.0.md) | This release includes the following experimental features: segment replication, remote-backed storage, and drag and drop for OpenSearch Dashboards. Experimental features allow you to test new functionality in OpenSearch. Because these features are still being developed, your testing and feedback can help shape the development of the feature before it's official released. We do not recommend use of experimental features in production. Additionally, this release adds maketime and makedate datetime functions for the SQL plugin. Creates a new [OpenSearch Playground](https://playground.opensearch.org) demo site for OpenSearch Dashboards. For a full list of release highlights, see the Release Notes. | 14 September 2022 +[2.2.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.2.1.md) | Includes gradle updates and bug fixes for gradle check. | 01 September 2022 +[2.2.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.2.0.md) | Includes support for Logistic Regression and RCF Summarize machine learning algorithms in ML Commons, Lucene or C-based Nmslib and Faiss libraries for approximate k-NN search, search by relevance using SQL and PPL queries, custom region maps for visualizations, and rollup enhancements. For a full list of release highlights, see the Release Notes. 
| 11 August 2022 +[2.1.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.1.0.md) | Includes support for dedicated ML node in the ML Commons plugin, relevance search and other features in SQL, multi-terms aggregation, and Snapshot Management. For a full list of release highlights, see the Release Notes. | 07 July 2022 +[2.0.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.0.1.md) | Includes bug fixes and maintenance updates for Alerting and Anomaly Detection. | 16 June 2022 +[2.0.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.0.0.md) | Includes document-level monitors for alerting, OpenSearch Notifications plugins, and Geo Map Tiles in OpenSearch Dashboards. Also adds support for Lucene 9 and bug fixes for all OpenSearch plugins. For a full list of release highlights, see the Release Notes. | 26 May 2022 +[2.0.0-rc1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.0.0-rc1.md) | The Release Candidate for 2.0.0. This version allows you to preview the upcoming 2.0.0 release before the GA release. The preview release adds document-level alerting, support for Lucene 9, and the ability to use term lookup queries in document level security. | 03 May 2022 +[1.3.15](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.15.md) | Includes bug fixes and maintenance updates for cross-cluster replication, SQL, OpenSearch Dashboards reporting, and alerting. | 05 March 2024 +[1.3.14](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.14.md) | Includes bug fixes and maintenance updates for OpenSearch security and OpenSearch Dashboards security. | 12 December 2023 +[1.3.13](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.13.md) | Includes bug fixes for Anomaly Detection, adds maintenance updates and infrastructure enhancements. | 21 September 2023 +[1.3.12](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.12.md) | Adds maintenance updates for OpenSearch security and OpenSearch Dashboards observability. Includes bug fixes for observability, OpenSearch Dashboards visualizations, and OpenSearch security. | 10 August 2023 +[1.3.11](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.11.md) | Adds maintenance updates for OpenSearch security, OpenSearch Dashboards security, and ML Commons. | 29 June 2023 +[1.3.10](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.10.md) | Adds infrastructure enhancements and maintenance updates for anomaly detection, observability, and security. Includes bug fixes for index management and OpenSearch security. | 18 May 2023 +[1.3.9](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.9.md) | Adds Debian support. Includes upgrades, enhancements, and maintenance updates for OpenSearch core, k-NN, and OpenSearch security. | 16 March 2023 +[1.3.8](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.8.md) | Adds OpenSearch security enhancements. Updates tool scripts to run on Windows. 
Includes maintenance updates and bug fixes for Anomaly Detection and OpenSearch security. | 02 February 2023 +[1.3.7](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.7.md) | Adds Windows support. Includes maintenance updates and bug fixes for error handling. | 13 December 2022 +[1.3.6](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.6.md) | Includes maintenance updates and bug fixes for tenancy in the OpenSearch Security Dashboards plugin. | 06 October 2022 +[1.3.5](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.5.md) | Includes maintenance updates and bug fixes for gradle check and OpenSearch security. | 01 September 2022 +[1.3.4](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.4.md) | Includes maintenance updates and bug fixes for OpenSearch and OpenSearch Dashboards. | 14 July 2022 +[1.3.3](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.3.md) | Adds enhancements to Anomaly Detection and ML Commons. Bug fixes for Anomaly Detection, Observability, and k-NN. | 09 June 2022 +[1.3.2](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.2.md) | Bug fixes for Anomaly Detection and the Security Dashboards Plugin, adds the option to install OpenSearch using RPM, as well as enhancements to the ML Commons execute task, and the removal of the job-scheduler zip in Anomaly Detection. | 05 May 2022 +[1.3.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.1.md) | Bug fixes when using document-level security, and adjusted ML Commons to use the latest RCF jar and protostuff to RCF model serialization. | 30 March 2022 +[1.3.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.0.md) | Adds Model Type Validation to Validate Detector API, continuous transforms, custom actions, applied policy parameter to Explain API, default action retries, and new rollover and transition conditions to Index Management, new ML Commons plugin, parse command to SQL, Application Analytics, Live Tail, Correlation, and Events Flyout to Observability, and auto backport and support for OPENSEARCH_JAVA_HOME to Performance Analyzer. Bug fixes. | 17 March 2022 +[1.2.4](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.2.4.md) | Updates Performance Analyzer, SQL, and Security plugins to Log4j 2.17.1, Alerting and Job Scheduler to cron-utils 9.1.6, and gson in Anomaly Detection and SQL. | 18 January 2022 +[1.2.3](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.2.3.md) | Updates the version of Log4j used in OpenSearch to Log4j 2.17.0 as recommended by the advisory in [CVE-2021-45105](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-45105). | 22 December 2021 +[1.2.0](https://github.com/opensearch-project/OpenSearch/blob/main/release-notes/opensearch.release-notes-1.2.0.md) | Adds observability, new validation API for Anomaly Detection, shard-level indexing back-pressure, new "match" query type for SQL and PPL, support for Faiss libraries in k-NN, and custom Dashboards branding. 
| 23 November 2021 +[1.1.0](https://github.com/opensearch-project/opensearch-build/tree/main/release-notes/opensearch-release-notes-1.1.0.md) | Adds cross-cluster replication, security for Index Management, bucket-level alerting, a CLI to help with upgrading from Elasticsearch OSS to OpenSearch, and enhancements to high cardinality data in the anomaly detection plugin. | 05 October 2021 +[1.0.1](https://github.com/opensearch-project/opensearch-build/tree/main/release-notes/opensearch-release-notes-1.0.1.md) | Bug fixes. | 01 September 2021 +[1.0.0](https://github.com/opensearch-project/opensearch-build/tree/main/release-notes/opensearch-release-notes-1.0.0.md) | General availability release. Adds compatibility setting for clients that require a version check before connecting. | 12 July 2021 +[1.0.0-rc1](https://github.com/opensearch-project/opensearch-build/tree/main/release-notes/opensearch-release-notes-1.0.0-rc1.md) | First release candidate. | 07 June 2021 +[1.0.0-beta1](https://github.com/opensearch-project/opensearch-build/tree/main/release-notes/opensearch-release-notes-1.0.0-beta1.md) | Initial beta release. Refactors plugins to work with OpenSearch. | 13 May 2021 diff --git a/_aggregations/bucket/adjacency-matrix.md b/_aggregations/bucket/adjacency-matrix.md new file mode 100644 index 00000000..fd521f85 --- /dev/null +++ b/_aggregations/bucket/adjacency-matrix.md @@ -0,0 +1,101 @@ +--- +layout: default +title: Adjacency matrix +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 10 +redirect_from: + - /query-dsl/aggregations/bucket/adjacency-matrix/ +--- + +# Adjacency matrix aggregations + +The `adjacency_matrix` aggregation lets you define filter expressions and returns a matrix of the intersecting filters where each non-empty cell in the matrix represents a bucket. You can find how many documents fall within any combination of filters. + +Use the `adjacency_matrix` aggregation to discover how concepts are related by visualizing the data as graphs. + +For example, in the sample eCommerce dataset, to analyze how the different manufacturing companies are related: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "interactions": { + "adjacency_matrix": { + "filters": { + "grpA": { + "match": { + "manufacturer.keyword": "Low Tide Media" + } + }, + "grpB": { + "match": { + "manufacturer.keyword": "Elitelligence" + } + }, + "grpC": { + "match": { + "manufacturer.keyword": "Oceanavigations" + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + + ```json + { + ... + "aggregations" : { + "interactions" : { + "buckets" : [ + { + "key" : "grpA", + "doc_count" : 1553 + }, + { + "key" : "grpA&grpB", + "doc_count" : 590 + }, + { + "key" : "grpA&grpC", + "doc_count" : 329 + }, + { + "key" : "grpB", + "doc_count" : 1370 + }, + { + "key" : "grpB&grpC", + "doc_count" : 299 + }, + { + "key" : "grpC", + "doc_count" : 1218 + } + ] + } + } + } +``` + + Let’s take a closer look at the result: + + ```json + { + "key" : "grpA&grpB", + "doc_count" : 590 + } + ``` + +- `grpA`: Products manufactured by Low Tide Media. +- `grpB`: Products manufactured by Elitelligence. +- `590`: Number of products that are manufactured by both. + +You can use OpenSearch Dashboards to represent this data with a network graph. 
\ No newline at end of file diff --git a/_aggregations/bucket/date-histogram.md b/_aggregations/bucket/date-histogram.md new file mode 100644 index 00000000..e308104e --- /dev/null +++ b/_aggregations/bucket/date-histogram.md @@ -0,0 +1,61 @@ +--- +layout: default +title: Date histogram +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 20 +redirect_from: + - /query-dsl/aggregations/bucket/date-histogram/ +--- + +# Date histogram aggregations + +The `date_histogram` aggregation uses [date math]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/#date-math) to generate histograms for time-series data. + +For example, you can find how many hits your website gets per month: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "logs_per_month": { + "date_histogram": { + "field": "@timestamp", + "interval": "month" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "logs_per_month" : { + "buckets" : [ + { + "key_as_string" : "2020-10-01T00:00:00.000Z", + "key" : 1601510400000, + "doc_count" : 1635 + }, + { + "key_as_string" : "2020-11-01T00:00:00.000Z", + "key" : 1604188800000, + "doc_count" : 6844 + }, + { + "key_as_string" : "2020-12-01T00:00:00.000Z", + "key" : 1606780800000, + "doc_count" : 5595 + } + ] + } +} +} +``` + +The response has three months worth of logs. If you graph these values, you can see the peak and valleys of the request traffic to your website month over month. diff --git a/_aggregations/bucket/date-range.md b/_aggregations/bucket/date-range.md new file mode 100644 index 00000000..c7d66d72 --- /dev/null +++ b/_aggregations/bucket/date-range.md @@ -0,0 +1,57 @@ +--- +layout: default +title: Date range +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 30 +redirect_from: + - /query-dsl/aggregations/bucket/date-range/ +--- + +# Date range aggregations + +The `date_range` aggregation is conceptually the same as the `range` aggregation, except that it lets you perform date math. +For example, you can get all documents from the last 10 days. To make the date more readable, include the format with a `format` parameter: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "number_of_bytes": { + "date_range": { + "field": "@timestamp", + "format": "MM-yyyy", + "ranges": [ + { + "from": "now-10d/d", + "to": "now" + } + ] + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "number_of_bytes" : { + "buckets" : [ + { + "key" : "03-2021-03-2021", + "from" : 1.6145568E12, + "from_as_string" : "03-2021", + "to" : 1.615451329043E12, + "to_as_string" : "03-2021", + "doc_count" : 0 + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/diversified-sampler.md b/_aggregations/bucket/diversified-sampler.md new file mode 100644 index 00000000..7249ac35 --- /dev/null +++ b/_aggregations/bucket/diversified-sampler.md @@ -0,0 +1,66 @@ +--- +layout: default +title: Diversified sampler +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 40 +redirect_from: + - /query-dsl/aggregations/bucket/diversified-sampler/ +--- + +# Diversified sampler + +The `diversified_sampler` aggregation lets you reduce the bias in the distribution of the sample pool by deduplicating documents containing the same `field`. 
It does so by using the `max_docs_per_value` and `field` settings, which limit the maximum number of documents collected on a shard for the provided `field`. The `max_docs_per_value` setting is an optional parameter used to determine the maximum number of documents that will be returned per `field`. The default value of this setting is `1`. + +Similarly to the [`sampler` aggregation]({{site.url}}{{site.baseurl}}/aggregations/bucket/sampler/), you can use the `shard_size` setting to control the maximum number of documents collected on any one shard, as shown in the following example: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "sample": { + "diversified_sampler": { + "shard_size": 1000, + "field": "response.keyword" + }, + "aggs": { + "terms": { + "terms": { + "field": "agent.keyword" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "sample" : { + "doc_count" : 3, + "terms" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1", + "doc_count" : 2 + }, + { + "key" : "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)", + "doc_count" : 1 + } + ] + } + } + + } +} +``` + + diff --git a/_aggregations/bucket/filter.md b/_aggregations/bucket/filter.md new file mode 100644 index 00000000..0768ea11 --- /dev/null +++ b/_aggregations/bucket/filter.md @@ -0,0 +1,56 @@ +--- +layout: default +title: Filter +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 50 +redirect_from: + - /query-dsl/aggregations/bucket/filter/ +--- + +# Filter aggregations + +A `filter` aggregation is a query clause, exactly like a search query, such as `match`, `term`, or `range`. You can use the `filter` aggregation to narrow down the entire set of documents to a specific set before creating buckets. + +The following example shows the `avg` aggregation running within the context of a filter. The `avg` aggregation only aggregates the documents that match the `range` query: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "low_value": { + "filter": { + "range": { + "taxful_total_price": { + "lte": 50 + } + } + }, + "aggs": { + "avg_amount": { + "avg": { + "field": "taxful_total_price" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "low_value" : { + "doc_count" : 1633, + "avg_amount" : { + "value" : 38.363175998928355 + } + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/filters.md b/_aggregations/bucket/filters.md new file mode 100644 index 00000000..b3977da7 --- /dev/null +++ b/_aggregations/bucket/filters.md @@ -0,0 +1,81 @@ +--- +layout: default +title: Filters +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 60 +redirect_from: + - /query-dsl/aggregations/bucket/filters/ +--- + +# Filters aggregations + +A `filters` aggregation is the same as the `filter` aggregation, except that it lets you use multiple filter aggregations. +While the `filter` aggregation results in a single bucket, the `filters` aggregation returns multiple buckets, one for each of the defined filters.
+ +To create a bucket for all the documents that didn't match the any of the filter queries, set the `other_bucket` property to `true`: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "200_os": { + "filters": { + "other_bucket": true, + "filters": [ + { + "term": { + "response.keyword": "200" + } + }, + { + "term": { + "machine.os.keyword": "osx" + } + } + ] + }, + "aggs": { + "avg_amount": { + "avg": { + "field": "bytes" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "200_os" : { + "buckets" : [ + { + "doc_count" : 12832, + "avg_amount" : { + "value" : 5897.852711970075 + } + }, + { + "doc_count" : 2825, + "avg_amount" : { + "value" : 5620.347256637168 + } + }, + { + "doc_count" : 1017, + "avg_amount" : { + "value" : 3247.0963618485744 + } + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/geo-distance.md b/_aggregations/bucket/geo-distance.md new file mode 100644 index 00000000..a111015a --- /dev/null +++ b/_aggregations/bucket/geo-distance.md @@ -0,0 +1,160 @@ +--- +layout: default +title: Geodistance +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 70 +redirect_from: + - /query-dsl/aggregations/bucket/geo-distance/ +--- + +# Geodistance aggregations + +The `geo_distance` aggregation groups documents into concentric circles based on distances from an origin `geo_point` field. +It's the same as the `range` aggregation, except that it works on geo locations. + +For example, you can use the `geo_distance` aggregation to find all pizza places within 1 km of you. The search results are limited to the 1 km radius specified by you, but you can add another result found within 2 km. + +You can only use the `geo_distance` aggregation on fields mapped as `geo_point`. + +A point is a single geographical coordinate, such as your current location shown by your smart-phone. A point in OpenSearch is represented as follows: + +```json +{ + "location": { + "type": "point", + "coordinates": { + "lat": 83.76, + "lon": -81.2 + } + } +} +``` + +You can also specify the latitude and longitude as an array `[-81.20, 83.76]` or as a string `"83.76, -81.20"` + +This table lists the relevant fields of a `geo_distance` aggregation: + +Field | Description | Required +:--- | :--- |:--- +`field` | Specify the geopoint field that you want to work on. | Yes +`origin` | Specify the geopoint that's used to compute the distances from. | Yes +`ranges` | Specify a list of ranges to collect documents based on their distance from the target point. | Yes +`unit` | Define the units used in the `ranges` array. The `unit` defaults to `m` (meters), but you can switch to other units like `km` (kilometers), `mi` (miles), `in` (inches), `yd` (yards), `cm` (centimeters), and `mm` (millimeters). | No +`distance_type` | Specify how OpenSearch calculates the distance. The default is `sloppy_arc` (faster but less accurate), but can also be set to `arc` (slower but most accurate) or `plane` (fastest but least accurate). Because of high error margins, use `plane` only for small geographic areas. 
| No + +The syntax is as follows: + +```json +{ + "aggs": { + "aggregation_name": { + "geo_distance": { + "field": "field_1", + "origin": "x, y", + "ranges": [ + { + "to": "value_1" + }, + { + "from": "value_2", + "to": "value_3" + }, + { + "from": "value_4" + } + ] + } + } + } +} +``` + +This example forms buckets from the following distances from a `geo-point` field: + +- Fewer than 10 km +- From 10 to 20 km +- From 20 to 50 km +- From 50 to 100 km +- Above 100 km + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "position": { + "geo_distance": { + "field": "geo.coordinates", + "origin": { + "lat": 83.76, + "lon": -81.2 + }, + "ranges": [ + { + "to": 10 + }, + { + "from": 10, + "to": 20 + }, + { + "from": 20, + "to": 50 + }, + { + "from": 50, + "to": 100 + }, + { + "from": 100 + } + ] + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "position" : { + "buckets" : [ + { + "key" : "*-10.0", + "from" : 0.0, + "to" : 10.0, + "doc_count" : 0 + }, + { + "key" : "10.0-20.0", + "from" : 10.0, + "to" : 20.0, + "doc_count" : 0 + }, + { + "key" : "20.0-50.0", + "from" : 20.0, + "to" : 50.0, + "doc_count" : 0 + }, + { + "key" : "50.0-100.0", + "from" : 50.0, + "to" : 100.0, + "doc_count" : 0 + }, + { + "key" : "100.0-*", + "from" : 100.0, + "doc_count" : 14074 + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/geohash-grid.md b/_aggregations/bucket/geohash-grid.md new file mode 100644 index 00000000..13f89799 --- /dev/null +++ b/_aggregations/bucket/geohash-grid.md @@ -0,0 +1,280 @@ +--- +layout: default +title: Geohash grid +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 80 +redirect_from: + - /query-dsl/aggregations/bucket/geohash-grid/ +--- + +# Geohash grid aggregations + +The `geohash_grid` aggregation buckets documents for geographical analysis. It organizes a geographical region into a grid of smaller regions of different sizes or precisions. Lower values of precision represent larger geographical areas, and higher values represent smaller, more precise geographical areas. You can aggregate documents on [geopoint]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point/) or [geoshape]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-shape/) fields using a geohash grid aggregation. One notable difference is that a geopoint is only present in one bucket, but a geoshape is counted in all geohash grid cells with which it intersects. + +The number of results returned by a query might be far too many to display each geopoint individually on a map. The `geohash_grid` aggregation buckets nearby geopoints together by calculating the geohash for each point, at the level of precision that you define (between 1 to 12; the default is 5). To learn more about geohash, see [Wikipedia](https://en.wikipedia.org/wiki/Geohash). + +The web logs example data is spread over a large geographical area, so you can use a lower precision value. You can zoom in on this map by increasing the precision value: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "geo_hash": { + "geohash_grid": { + "field": "geo.coordinates", + "precision": 4 + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... 
+"aggregations" : { + "geo_hash" : { + "buckets" : [ + { + "key" : "c1cg", + "doc_count" : 104 + }, + { + "key" : "dr5r", + "doc_count" : 26 + }, + { + "key" : "9q5b", + "doc_count" : 20 + }, + { + "key" : "c20g", + "doc_count" : 19 + }, + { + "key" : "dr70", + "doc_count" : 18 + } + ... + ] + } + } +} +``` + +You can visualize the aggregated response on a map using OpenSearch Dashboards. + +The more accurate you want the aggregation to be, the more resources OpenSearch consumes because of the number of buckets that the aggregation has to calculate. By default, OpenSearch does not generate more than 10,000 buckets. You can change this behavior by using the `size` attribute, but keep in mind that the performance might suffer for very wide queries consisting of thousands of buckets. + +## Aggregating geoshapes + +To run an aggregation on a geoshape field, first create an index and map the `location` field as a `geo_shape`: + +```json +PUT national_parks +{ + "mappings": { + "properties": { + "location": { + "type": "geo_shape" + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index some documents into the `national_parks` index: + +```json +PUT national_parks/_doc/1 +{ + "name": "Yellowstone National Park", + "location": + {"type": "envelope","coordinates": [ [-111.15, 45.12], [-109.83, 44.12] ]} +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/2 +{ + "name": "Yosemite National Park", + "location": + {"type": "envelope","coordinates": [ [-120.23, 38.16], [-119.05, 37.45] ]} +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/3 +{ + "name": "Death Valley National Park", + "location": + {"type": "envelope","coordinates": [ [-117.34, 37.01], [-116.38, 36.25] ]} +} +``` +{% include copy-curl.html %} + +You can run an aggregation on the `location` field as follows: + +```json +GET national_parks/_search +{ + "aggregations": { + "grouped": { + "geohash_grid": { + "field": "location", + "precision": 1 + } + } + } +} +``` +{% include copy-curl.html %} + +When aggregating geoshapes, one geoshape can be counted for multiple buckets because it overlaps multiple grid cells: + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 24, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "national_parks", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "name" : "Yellowstone National Park", + "location" : { + "type" : "envelope", + "coordinates" : [ + [ + -111.15, + 45.12 + ], + [ + -109.83, + 44.12 + ] + ] + } + } + }, + { + "_index" : "national_parks", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "name" : "Yosemite National Park", + "location" : { + "type" : "envelope", + "coordinates" : [ + [ + -120.23, + 38.16 + ], + [ + -119.05, + 37.45 + ] + ] + } + } + }, + { + "_index" : "national_parks", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "name" : "Death Valley National Park", + "location" : { + "type" : "envelope", + "coordinates" : [ + [ + -117.34, + 37.01 + ], + [ + -116.38, + 36.25 + ] + ] + } + } + } + ] + }, + "aggregations" : { + "grouped" : { + "buckets" : [ + { + "key" : "9", + "doc_count" : 3 + }, + { + "key" : "c", + "doc_count" : 1 + } + ] + } + } +} +``` +
+ +Currently, OpenSearch supports geoshape aggregation through the API but not in OpenSearch Dashboards visualizations. If you'd like to see geoshape aggregation implemented for visualizations, upvote the related [GitHub issue](https://github.com/opensearch-project/dashboards-maps/issues/250). +{: .note} + +## Supported parameters + +Geohash grid aggregation requests support the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +field | String | The field on which aggregation is performed. This field must be mapped as a `geo_point` or `geo_shape` field. If the field contains an array, all array values are aggregated. Required. +precision | Integer | The granularity level used to determine grid cells for bucketing results. Cells cannot exceed the specified size (diagonal) of the required precision. Valid values are in the [0, 12] range. Optional. Default is 5. +bounds | Object | The bounding box for filtering geopoints and geoshapes. The bounding box is defined by the upper-left and lower-right vertices. Only shapes that intersect with this bounding box or are completely enclosed by this bounding box are included in the aggregation output. The vertices are specified as geopoints in one of the following formats:
- An object with a latitude and longitude
- An array in the [`longitude`, `latitude`] format
- A string in the "`latitude`,`longitude`" format
- A geohash
- WKT
See the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats) for formatting examples. Optional. +size | Integer | The maximum number of buckets to return. When there are more buckets than `size`, OpenSearch returns buckets with more documents. Optional. Default is 10,000. +shard_size | Integer | The maximum number of buckets to return from each shard. Optional. Default is max (10, `size` · number of shards), which provides a more accurate count of more highly prioritized buckets. + +## Geohash precision + +The relationship between geohash precision and the approximate grid cell dimensions is described in the following table. + +Precision /
geohash length | Latitude bits | Longitude bits | Latitude error | Longitude error | Cell height | Cell width +:---:|:-------------:|:--------------:|:--------------:|:---------------:|:-----------:|:----------: + 1 | 2 | 3 | ±23 | ±23 | 4992.6 km | 5009.4 km + 2 | 5 | 5 | ±2.8 | ±5.6 | 624.1 km | 1252.3 km + 3 | 7 | 8 | ±0.70 | ±0.70 | 156 km | 156.5 km + 4 | 10 | 10 | ±0.087 | ±0.18 | 19.5 km | 39.1 km + 5 | 12 | 13 | ±0.022 | ±0.022 | 4.9 km | 4.9 km + 6 | 15 | 15 | ±0.0027 | ±0.0055 | 609.4 m | 1.2 km + 7 | 17 | 18 | ±0.00068 | ±0.00068 | 152.5 m | 152.9 m + 8 | 20 | 20 | ±0.00086 | ±0.000172 | 19 m | 38.2 m + 9 | 22 | 23 | ±0.000021 | ±0.000021 | 4.8 m | 4.8 m + 10 | 25 | 25 | ±0.00000268 | ±0.00000536 | 59.5 cm | 1.2 m + 11 | 27 | 28 | ±0.00000067 | ±0.00000067 | 14.9 cm | 14.9 cm + 12 | 30 | 30 | ±0.00000008 | ±0.00000017 | 1.9 cm | 3.7 cm \ No newline at end of file diff --git a/_aggregations/bucket/geohex-grid.md b/_aggregations/bucket/geohex-grid.md new file mode 100644 index 00000000..de110610 --- /dev/null +++ b/_aggregations/bucket/geohex-grid.md @@ -0,0 +1,393 @@ +--- +layout: default +title: Geohex grid +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 85 +redirect_from: + - /aggregations/geohexgrid/ + - /query-dsl/aggregations/geohexgrid/ + - /query-dsl/aggregations/bucket/geohex-grid/ +--- + +# Geohex grid aggregations + +The Hexagonal Hierarchical Geospatial Indexing System (H3) partitions the Earth's areas into identifiable hexagon-shaped cells. + +The H3 grid system works well for proximity applications because it overcomes the limitations of Geohash's non-uniform partitions. Geohash encodes latitude and longitude pairs, leading to significantly smaller partitions near the poles and a degree of longitude near the equator. However, the H3 grid system's distortions are low and limited to 5 partitions of 122. These five partitions are placed in low-use areas (for example, in the middle of the ocean), leaving the essential areas error free. Thus, grouping documents based on the H3 grid system provides a better aggregation than the Geohash grid. + +The geohex grid aggregation groups [geopoints]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point/) into grid cells for geographical analysis. Each grid cell corresponds to an [H3 cell](https://h3geo.org/docs/core-library/h3Indexing/#h3-cell-indexp) and is identified using the [H3Index representation](https://h3geo.org/docs/core-library/h3Indexing/#h3index-representation). + +## Precision + +The `precision` parameter controls the level of granularity that determines the grid cell size. The lower the precision, the larger the grid cells. + +The following example illustrates low-precision and high-precision aggregation requests. + +To start, create an index and map the `location` field as a `geo_point`: + +```json +PUT national_parks +{ + "mappings": { + "properties": { + "location": { + "type": "geo_point" + } + } + } +} +``` +{% include copy-curl.html %} + +Index the following documents into the sample index: + +```json +PUT national_parks/_doc/1 +{ + "name": "Yellowstone National Park", + "location": "44.42, -110.59" +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/2 +{ + "name": "Yosemite National Park", + "location": "37.87, -119.53" +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/3 +{ + "name": "Death Valley National Park", + "location": "36.53, -116.93" +} +``` +{% include copy-curl.html %} + +You can index geopoints in several formats. 
For a list of all supported formats, see the [geopoint documentation]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats). +{: .note} + +## Low-precision requests + +Run a low-precision request that buckets all three documents together: + +```json +GET national_parks/_search +{ + "aggregations": { + "grouped": { + "geohex_grid": { + "field": "location", + "precision": 1 + } + } + } +} +``` +{% include copy-curl.html %} + +You can use either the `GET` or `POST` HTTP method for geohex grid aggregation queries. +{: .note} + +The response groups documents 2 and 3 together because they are close enough to be bucketed in one grid cell: + +```json +{ + "took" : 4, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "national_parks", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "name" : "Yellowstone National Park", + "location" : "44.42, -110.59" + } + }, + { + "_index" : "national_parks", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "name" : "Yosemite National Park", + "location" : "37.87, -119.53" + } + }, + { + "_index" : "national_parks", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "name" : "Death Valley National Park", + "location" : "36.53, -116.93" + } + } + ] + }, + "aggregations" : { + "grouped" : { + "buckets" : [ + { + "key" : "8129bffffffffff", + "doc_count" : 2 + }, + { + "key" : "8128bffffffffff", + "doc_count" : 1 + } + ] + } + } +} +``` + +## High-precision requests + +Now run a high-precision request: + +```json +GET national_parks/_search +{ + "aggregations": { + "grouped": { + "geohex_grid": { + "field": "location", + "precision": 6 + } + } + } +} +``` +{% include copy-curl.html %} + +All three documents are bucketed separately because of higher granularity: + +```json +{ + "took" : 5, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "national_parks", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "name" : "Yellowstone National Park", + "location" : "44.42, -110.59" + } + }, + { + "_index" : "national_parks", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "name" : "Yosemite National Park", + "location" : "37.87, -119.53" + } + }, + { + "_index" : "national_parks", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "name" : "Death Valley National Park", + "location" : "36.53, -116.93" + } + } + ] + }, + "aggregations" : { + "grouped" : { + "buckets" : [ + { + "key" : "8629ab6dfffffff", + "doc_count" : 1 + }, + { + "key" : "8629857a7ffffff", + "doc_count" : 1 + }, + { + "key" : "862896017ffffff", + "doc_count" : 1 + } + ] + } + } +} +``` + +## Filtering requests + +High-precision requests are resource intensive, so we recommend using a filter like `geo_bounding_box` to limit the geographical area. 
For example, the following query applies a filter to limit the search area: + +```json +GET national_parks/_search +{ + "size" : 0, + "aggregations": { + "filtered": { + "filter": { + "geo_bounding_box": { + "location": { + "top_left": "38, -120", + "bottom_right": "36, -116" + } + } + }, + "aggregations": { + "grouped": { + "geohex_grid": { + "field": "location", + "precision": 6 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the two documents that are within the `geo_bounding_box` bounds: + +```json +{ + "took" : 4, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "filtered" : { + "doc_count" : 2, + "grouped" : { + "buckets" : [ + { + "key" : "8629ab6dfffffff", + "doc_count" : 1 + }, + { + "key" : "8629857a7ffffff", + "doc_count" : 1 + } + ] + } + } + } +} +``` + +You can also restrict the geographical area by providing the coordinates of the bounding envelope in the `bounds` parameter. Both `bounds` and `geo_bounding_box` coordinates can be specified in any of the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats). The following query uses the well-known text (WKT) "POINT(`longitude` `latitude`)" format for the `bounds` parameter: + +```json +GET national_parks/_search +{ + "size": 0, + "aggregations": { + "grouped": { + "geohex_grid": { + "field": "location", + "precision": 6, + "bounds": { + "top_left": "POINT (-120 38)", + "bottom_right": "POINT (-116 36)" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains only the two results that are within the specified bounds: + +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "grouped" : { + "buckets" : [ + { + "key" : "8629ab6dfffffff", + "doc_count" : 1 + }, + { + "key" : "8629857a7ffffff", + "doc_count" : 1 + } + ] + } + } +} +``` + +The `bounds` parameter can be used with or without the `geo_bounding_box` filter; these two parameters are independent and can have any spatial relationship to each other. + +## Supported parameters + +Geohex grid aggregation requests support the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +field | String | The field that contains the geopoints. This field must be mapped as a `geo_point` field. If the field contains an array, all array values are aggregated. Required. +precision | Integer | The granularity level used to determine grid cells for bucketing results. Cells cannot exceed the specified size (diagonal) of the required precision. Valid values are in the [0, 15] range. Optional. Default is 5. +bounds | Object | The bounding box for filtering geopoints. The bounding box is defined by the upper-left and lower-right vertices. The vertices are specified as geopoints in one of the following formats:
- An object with a latitude and longitude
- An array in the [`longitude`, `latitude`] format
- A string in the "`latitude`,`longitude`" format
- A geohash
- WKT
See the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats) for formatting examples. Optional. +size | Integer | The maximum number of buckets to return. When there are more buckets than `size`, OpenSearch returns buckets with more documents. Optional. Default is 10,000. +shard_size | Integer | The maximum number of buckets to return from each shard. Optional. Default is max (10, `size` · number of shards), which provides a more accurate count of more highly prioritized buckets. \ No newline at end of file diff --git a/_aggregations/bucket/geotile-grid.md b/_aggregations/bucket/geotile-grid.md new file mode 100644 index 00000000..dd0c4f8a --- /dev/null +++ b/_aggregations/bucket/geotile-grid.md @@ -0,0 +1,550 @@ +--- +layout: default +title: Geotile grid +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 87 +redirect_from: + - /query-dsl/aggregations/bucket/geotile-grid/ +--- + +# Geotile grid aggregations + +The geotile grid aggregation groups documents into grid cells for geographical analysis. Each grid cell corresponds to a [map tile](https://en.wikipedia.org/wiki/Tiled_web_map) and is identified using the `{zoom}/{x}/{y}` format. You can aggregate documents on [geopoint]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point/) or [geoshape]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-shape/) fields using a geotile grid aggregation. One notable difference is that a geopoint is only present in one bucket, but a geoshape is counted in all geotile grid cells with which it intersects. + +## Precision + +The `precision` parameter controls the level of granularity that determines the grid cell size. The lower the precision, the larger the grid cells. + +The following example illustrates low-precision and high-precision aggregation requests. + +To start, create an index and map the `location` field as a `geo_point`: + +```json +PUT national_parks +{ + "mappings": { + "properties": { + "location": { + "type": "geo_point" + } + } + } +} +``` +{% include copy-curl.html %} + +Index the following documents into the sample index: + +```json +PUT national_parks/_doc/1 +{ + "name": "Yellowstone National Park", + "location": "44.42, -110.59" +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/2 +{ + "name": "Yosemite National Park", + "location": "37.87, -119.53" +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/3 +{ + "name": "Death Valley National Park", + "location": "36.53, -116.93" +} +``` +{% include copy-curl.html %} + +You can index geopoints in several formats. For a list of all supported formats, see the [geopoint documentation]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats). +{: .note} + +## Low-precision requests + +Run a low-precision request that buckets all three documents together: + +```json +GET national_parks/_search +{ + "aggregations": { + "grouped": { + "geotile_grid": { + "field": "location", + "precision": 1 + } + } + } +} +``` +{% include copy-curl.html %} + +You can use either the `GET` or `POST` HTTP method for geotile grid aggregation queries. +{: .note} + +The response groups all documents together because they are close enough to be bucketed in one grid cell: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 51, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "national_parks", + "_id": "1", + "_score": 1, + "_source": { + "name": "Yellowstone National Park", + "location": "44.42, -110.59" + } + }, + { + "_index": "national_parks", + "_id": "2", + "_score": 1, + "_source": { + "name": "Yosemite National Park", + "location": "37.87, -119.53" + } + }, + { + "_index": "national_parks", + "_id": "3", + "_score": 1, + "_source": { + "name": "Death Valley National Park", + "location": "36.53, -116.93" + } + } + ] + }, + "aggregations": { + "grouped": { + "buckets": [ + { + "key": "1/0/0", + "doc_count": 3 + } + ] + } + } +} +``` +
+ +## High-precision requests + +Now run a high-precision request: + +```json +GET national_parks/_search +{ + "aggregations": { + "grouped": { + "geotile_grid": { + "field": "location", + "precision": 6 + } + } + } +} +``` +{% include copy-curl.html %} + +All three documents are bucketed separately because of higher granularity: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 15, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "national_parks", + "_id": "1", + "_score": 1, + "_source": { + "name": "Yellowstone National Park", + "location": "44.42, -110.59" + } + }, + { + "_index": "national_parks", + "_id": "2", + "_score": 1, + "_source": { + "name": "Yosemite National Park", + "location": "37.87, -119.53" + } + }, + { + "_index": "national_parks", + "_id": "3", + "_score": 1, + "_source": { + "name": "Death Valley National Park", + "location": "36.53, -116.93" + } + } + ] + }, + "aggregations": { + "grouped": { + "buckets": [ + { + "key": "6/12/23", + "doc_count": 1 + }, + { + "key": "6/11/25", + "doc_count": 1 + }, + { + "key": "6/10/24", + "doc_count": 1 + } + ] + } + } +} +``` +
+ +You can also restrict the geographical area by providing the coordinates of the bounding envelope in the `bounds` parameter. Both `bounds` and `geo_bounding_box` coordinates can be specified in any of the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats). The following query uses the well-known text (WKT) "POINT(`longitude` `latitude`)" format for the `bounds` parameter: + +```json +GET national_parks/_search +{ + "size": 0, + "aggregations": { + "grouped": { + "geotile_grid": { + "field": "location", + "precision": 6, + "bounds": { + "top_left": "POINT (-120 38)", + "bottom_right": "POINT (-116 36)" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains only the two results that are within the specified bounds: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 48, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "national_parks", + "_id": "1", + "_score": 1, + "_source": { + "name": "Yellowstone National Park", + "location": "44.42, -110.59" + } + }, + { + "_index": "national_parks", + "_id": "2", + "_score": 1, + "_source": { + "name": "Yosemite National Park", + "location": "37.87, -119.53" + } + }, + { + "_index": "national_parks", + "_id": "3", + "_score": 1, + "_source": { + "name": "Death Valley National Park", + "location": "36.53, -116.93" + } + } + ] + }, + "aggregations": { + "grouped": { + "buckets": [ + { + "key": "6/11/25", + "doc_count": 1 + }, + { + "key": "6/10/24", + "doc_count": 1 + } + ] + } + } +} +``` +
+ +The `bounds` parameter can be used with or without the `geo_bounding_box` filter; these two parameters are independent and can have any spatial relationship to each other. + +## Aggregating geoshapes + +To run an aggregation on a geoshape field, first create an index and map the `location` field as a `geo_shape`: + +```json +PUT national_parks +{ + "mappings": { + "properties": { + "location": { + "type": "geo_shape" + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index some documents into the `national_parks` index: + +```json +PUT national_parks/_doc/1 +{ + "name": "Yellowstone National Park", + "location": + {"type": "envelope","coordinates": [ [-111.15, 45.12], [-109.83, 44.12] ]} +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/2 +{ + "name": "Yosemite National Park", + "location": + {"type": "envelope","coordinates": [ [-120.23, 38.16], [-119.05, 37.45] ]} +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/3 +{ + "name": "Death Valley National Park", + "location": + {"type": "envelope","coordinates": [ [-117.34, 37.01], [-116.38, 36.25] ]} +} +``` +{% include copy-curl.html %} + +You can run an aggregation on the `location` field as follows: + +```json +GET national_parks/_search +{ + "aggregations": { + "grouped": { + "geotile_grid": { + "field": "location", + "precision": 6 + } + } + } +} +``` +{% include copy-curl.html %} + +When aggregating geoshapes, one geoshape can be counted for multiple buckets because it overlaps with multiple grid cells: + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "national_parks", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "name" : "Yellowstone National Park", + "location" : { + "type" : "envelope", + "coordinates" : [ + [ + -111.15, + 45.12 + ], + [ + -109.83, + 44.12 + ] + ] + } + } + }, + { + "_index" : "national_parks", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "name" : "Yosemite National Park", + "location" : { + "type" : "envelope", + "coordinates" : [ + [ + -120.23, + 38.16 + ], + [ + -119.05, + 37.45 + ] + ] + } + } + }, + { + "_index" : "national_parks", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "name" : "Death Valley National Park", + "location" : { + "type" : "envelope", + "coordinates" : [ + [ + -117.34, + 37.01 + ], + [ + -116.38, + 36.25 + ] + ] + } + } + } + ] + }, + "aggregations" : { + "grouped" : { + "buckets" : [ + { + "key" : "6/12/23", + "doc_count" : 1 + }, + { + "key" : "6/12/22", + "doc_count" : 1 + }, + { + "key" : "6/11/25", + "doc_count" : 1 + }, + { + "key" : "6/11/24", + "doc_count" : 1 + }, + { + "key" : "6/10/24", + "doc_count" : 1 + } + ] + } + } +} +``` +
+ +Currently, OpenSearch supports geoshape aggregation through the API but not in OpenSearch Dashboards visualizations. If you'd like to see geoshape aggregation implemented for visualizations, upvote the related [GitHub issue](https://github.com/opensearch-project/dashboards-maps/issues/250). +{: .note} + +## Supported parameters + +Geotile grid aggregation requests support the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +field | String | The field that contains the geopoints. This field must be mapped as a `geo_point` field. If the field contains an array, all array values are aggregated. Required. +precision | Integer | The granularity level used to determine grid cells for bucketing results. Cells cannot exceed the specified size (diagonal) of the required precision. Valid values are in the [0, 29] range. Optional. Default is 7. +bounds | Object | The bounding box for filtering geopoints. The bounding box is defined by the upper-left and lower-right vertices. The vertices are specified as geopoints in one of the following formats:
- An object with a latitude and longitude
- An array in the [`longitude`, `latitude`] format
- A string in the "`latitude`,`longitude`" format
- A geohash
- WKT
See the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats) for formatting examples. Optional. +size | Integer | The maximum number of buckets to return. When there are more buckets than `size`, OpenSearch returns buckets with more documents. Optional. Default is 10,000. +shard_size | Integer | The maximum number of buckets to return from each shard. Optional. Default is max (10, `size` · number of shards), which provides a more accurate count of more highly prioritized buckets. \ No newline at end of file diff --git a/_aggregations/bucket/global.md b/_aggregations/bucket/global.md new file mode 100644 index 00000000..bfd516b8 --- /dev/null +++ b/_aggregations/bucket/global.md @@ -0,0 +1,59 @@ +--- +layout: default +title: Global +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 90 +redirect_from: + - /query-dsl/aggregations/bucket/global/ +--- + +# Global aggregations + +The `global` aggregations lets you break out of the aggregation context of a filter aggregation. Even if you have included a filter query that narrows down a set of documents, the `global` aggregation aggregates on all documents as if the filter query wasn't there. It ignores the `filter` aggregation and implicitly assumes the `match_all` query. + +The following example returns the `avg` value of the `taxful_total_price` field from all documents in the index: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "query": { + "range": { + "taxful_total_price": { + "lte": 50 + } + } + }, + "aggs": { + "total_avg_amount": { + "global": {}, + "aggs": { + "avg_price": { + "avg": { + "field": "taxful_total_price" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "total_avg_amount" : { + "doc_count" : 4675, + "avg_price" : { + "value" : 75.05542864304813 + } + } + } +} +``` + +You can see that the average value for the `taxful_total_price` field is 75.05 and not the 38.36 as seen in the `filter` example when the query matched. \ No newline at end of file diff --git a/_aggregations/bucket/histogram.md b/_aggregations/bucket/histogram.md new file mode 100644 index 00000000..0d9f2bb9 --- /dev/null +++ b/_aggregations/bucket/histogram.md @@ -0,0 +1,54 @@ +--- +layout: default +title: Histogram +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 100 +redirect_from: + - /query-dsl/aggregations/bucket/histogram/ +--- + +# Histogram aggregations + +The `histogram` aggregation buckets documents based on a specified interval. + +With `histogram` aggregations, you can visualize the distributions of values in a given range of documents very easily. Now OpenSearch doesn’t give you back an actual graph of course, that’s what OpenSearch Dashboards is for. But it'll give you the JSON response that you can use to construct your own graph. + +The following example buckets the `number_of_bytes` field by 10,000 intervals: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "number_of_bytes": { + "histogram": { + "field": "bytes", + "interval": 10000 + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... 
+"aggregations" : { + "number_of_bytes" : { + "buckets" : [ + { + "key" : 0.0, + "doc_count" : 13372 + }, + { + "key" : 10000.0, + "doc_count" : 702 + } + ] + } + } +} +``` diff --git a/_aggregations/bucket/index.md b/_aggregations/bucket/index.md new file mode 100644 index 00000000..1658c06e --- /dev/null +++ b/_aggregations/bucket/index.md @@ -0,0 +1,45 @@ +--- +layout: default +title: Bucket aggregations +has_children: true +has_toc: false +nav_order: 3 +redirect_from: + - /opensearch/bucket-agg/ + - /query-dsl/aggregations/bucket-agg/ + - /query-dsl/aggregations/bucket/ + - /aggregations/bucket-agg/ +--- + +# Bucket aggregations + +Bucket aggregations categorize sets of documents as buckets. The type of bucket aggregation determines the bucket for a given document. + +You can use bucket aggregations to implement faceted navigation (usually placed as a sidebar on a search result landing page) to help your users filter the results. + +## Supported bucket aggregations + +OpenSearch supports the following bucket aggregations: + +- [Adjacency matrix]({{site.url}}{{site.baseurl}}/aggregations/bucket/adjacency-matrix/) +- [Date histogram]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-histogram/) +- [Date range]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-range/) +- [Diversified sampler]({{site.url}}{{site.baseurl}}/aggregations/bucket/diversified-sampler/) +- [Filter]({{site.url}}{{site.baseurl}}/aggregations/bucket/filter/) +- [Filters]({{site.url}}{{site.baseurl}}/aggregations/bucket/filters/) +- [Geodistance]({{site.url}}{{site.baseurl}}/aggregations/bucket/geo-distance/) +- [Geohash grid]({{site.url}}{{site.baseurl}}/aggregations/bucket/geohash-grid/) +- [Geohex grid]({{site.url}}{{site.baseurl}}/aggregations/bucket/geohex-grid/) +- [Geotile grid]({{site.url}}{{site.baseurl}}/aggregations/bucket/geotile-grid/) +- [Global]({{site.url}}{{site.baseurl}}/aggregations/bucket/global/) +- [Histogram]({{site.url}}{{site.baseurl}}/aggregations/bucket/histogram/) +- [IP range]({{site.url}}{{site.baseurl}}/aggregations/bucket/ip-range/) +- [Missing]({{site.url}}{{site.baseurl}}/aggregations/bucket/missing/) +- [Multi-terms]({{site.url}}{{site.baseurl}}/aggregations/bucket/multi-terms/) +- [Nested]({{site.url}}{{site.baseurl}}/aggregations/bucket/nested/) +- [Range]({{site.url}}{{site.baseurl}}/aggregations/bucket/range/) +- [Reverse nested]({{site.url}}{{site.baseurl}}/aggregations/bucket/reverse-nested/) +- [Sampler]({{site.url}}{{site.baseurl}}/aggregations/bucket/sampler/) +- [Significant terms]({{site.url}}{{site.baseurl}}/aggregations/bucket/significant-terms/) +- [Significant text]({{site.url}}{{site.baseurl}}/aggregations/bucket/significant-text/) +- [Terms]({{site.url}}{{site.baseurl}}/aggregations/bucket/terms/) \ No newline at end of file diff --git a/_aggregations/bucket/ip-range.md b/_aggregations/bucket/ip-range.md new file mode 100644 index 00000000..897827d4 --- /dev/null +++ b/_aggregations/bucket/ip-range.md @@ -0,0 +1,77 @@ +--- +layout: default +title: IP range +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 110 +redirect_from: + - /query-dsl/aggregations/bucket/ip-range/ +--- + +# IP range aggregations + +The `ip_range` aggregation is for IP addresses. +It works on `ip` type fields. You can define the IP ranges and masks in the [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing) notation. 
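The aggregated field must be mapped as the `ip` type before you can bucket on it. The following mapping is a minimal sketch that uses a hypothetical `network-logs` index; the sample web logs index used in the next example already contains a suitable `ip` field:

```json
PUT network-logs
{
  "mappings": {
    "properties": {
      "ip": {
        "type": "ip"
      }
    }
  }
}
```
{% include copy-curl.html %}

The following example defines one explicit address range and one CIDR mask on the sample web logs data: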
+ +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "access": { + "ip_range": { + "field": "ip", + "ranges": [ + { + "from": "1.0.0.0", + "to": "126.158.155.183" + }, + { + "mask": "1.0.0.0/8" + } + ] + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "access" : { + "buckets" : [ + { + "key" : "1.0.0.0/8", + "from" : "1.0.0.0", + "to" : "2.0.0.0", + "doc_count" : 98 + }, + { + "key" : "1.0.0.0-126.158.155.183", + "from" : "1.0.0.0", + "to" : "126.158.155.183", + "doc_count" : 7184 + } + ] + } + } +} +``` + +If you add a document with malformed fields to an index that has `ip_range` set to `false` in its mappings, OpenSearch rejects the entire document. You can set `ignore_malformed` to `true` to specify that OpenSearch should ignore malformed fields. The default is `false`. + +```json +... +"mappings": { + "properties": { + "ips": { + "type": "ip_range", + "ignore_malformed": true + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/missing.md b/_aggregations/bucket/missing.md new file mode 100644 index 00000000..54707685 --- /dev/null +++ b/_aggregations/bucket/missing.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Missing +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 120 +redirect_from: + - /query-dsl/aggregations/bucket/missing/ +--- + +# Missing aggregations + +If you have documents in your index that don’t contain the aggregating field at all or the aggregating field has a value of NULL, use the `missing` parameter to specify the name of the bucket such documents should be placed in. + +The following example adds any missing values to a bucket named "N/A": + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "response_codes": { + "terms": { + "field": "response.keyword", + "size": 10, + "missing": "N/A" + } + } + } +} +``` +{% include copy-curl.html %} + +Because the default value for the `min_doc_count` parameter is 1, the `missing` parameter doesn't return any buckets in its response. Set `min_doc_count` parameter to 0 to see the "N/A" bucket in the response: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "response_codes": { + "terms": { + "field": "response.keyword", + "size": 10, + "missing": "N/A", + "min_doc_count": 0 + } + } + } +} +``` + +#### Example response + +```json +... +"aggregations" : { + "response_codes" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "200", + "doc_count" : 12832 + }, + { + "key" : "404", + "doc_count" : 801 + }, + { + "key" : "503", + "doc_count" : 441 + }, + { + "key" : "N/A", + "doc_count" : 0 + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/multi-terms.md b/_aggregations/bucket/multi-terms.md new file mode 100644 index 00000000..eb779e7c --- /dev/null +++ b/_aggregations/bucket/multi-terms.md @@ -0,0 +1,125 @@ +--- +layout: default +title: Multi-terms +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 130 +redirect_from: + - /query-dsl/aggregations/multi-terms/ +--- + +# Multi-terms aggregations + +Similar to the `terms` bucket aggregation, you can also search for multiple terms using the `multi_terms` aggregation. Multi-terms aggregations are useful when you need to sort by document count, or when you need to sort by a metric aggregation on a composite key and get the top `n` results. 
For example, you could search for a specific number of documents (e.g., 1000) and the number of servers per location that show CPU usage greater than 90%. The top number of results would be returned for this multi-term query. + +The `multi_terms` aggregation does consume more memory than a `terms` aggregation, so its performance might be slower. +{: .tip } + +## Multi-terms aggregation parameters + +Parameter | Description +:--- | :--- +multi_terms | Indicates a multi-terms aggregation that gathers buckets of documents together based on criteria specified by multiple terms. +size | Specifies the number of buckets to return. Default is 10. +order | Indicates the order to sort the buckets. By default, buckets are ordered according to document count per bucket. If the buckets contain the same document count, then `order` can be explicitly set to the term value instead of document count. (e.g., set `order` to "max-cpu"). +doc_count | Specifies the number of documents to be returned in each bucket. By default, the top 10 terms are returned. + +#### Example request + +```json +GET sample-index100/_search +{ + "size": 0, + "aggs": { + "hot": { + "multi_terms": { + "terms": [{ + "field": "region" + },{ + "field": "host" + }], + "order": {"max-cpu": "desc"} + }, + "aggs": { + "max-cpu": { "max": { "field": "cpu" } } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 118, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 8, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "multi-terms": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": [ + "dub", + "h1" + ], + "key_as_string": "dub|h1", + "doc_count": 2, + "max-cpu": { + "value": 90.0 + } + }, + { + "key": [ + "dub", + "h2" + ], + "key_as_string": "dub|h2", + "doc_count": 2, + "max-cpu": { + "value": 70.0 + } + }, + { + "key": [ + "iad", + "h2" + ], + "key_as_string": "iad|h2", + "doc_count": 2, + "max-cpu": { + "value": 50.0 + } + }, + { + "key": [ + "iad", + "h1" + ], + "key_as_string": "iad|h1", + "doc_count": 2, + "max-cpu": { + "value": 15.0 + } + } + ] + } + } +} +``` diff --git a/_aggregations/bucket/nested.md b/_aggregations/bucket/nested.md new file mode 100644 index 00000000..94a0f441 --- /dev/null +++ b/_aggregations/bucket/nested.md @@ -0,0 +1,106 @@ +--- +layout: default +title: Nested +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 140 +redirect_from: + - /query-dsl/aggregations/bucket/nested/ +--- + +# Nested aggregations + +The `nested` aggregation lets you aggregate on fields inside a nested object. The `nested` type is a specialized version of the object data type that allows arrays of objects to be indexed in a way that they can be queried independently of each other + +With the `object` type, all the data is stored in the same document, so matches for a search can go across sub documents. 
For example, imagine a `logs` index with `pages` mapped as an `object` datatype: + +```json +PUT logs/_doc/0 +{ + "response": "200", + "pages": [ + { + "page": "landing", + "load_time": 200 + }, + { + "page": "blog", + "load_time": 500 + } + ] +} +``` +{% include copy-curl.html %} + +OpenSearch merges all sub-properties of the entity relations that looks something like this: + +```json +{ + "logs": { + "pages": ["landing", "blog"], + "load_time": ["200", "500"] + } +} +``` + +So, if you wanted to search this index with `pages=landing` and `load_time=500`, this document matches the criteria even though the `load_time` value for landing is 200. + +If you want to make sure such cross-object matches don’t happen, map the field as a `nested` type: + +```json +PUT logs +{ + "mappings": { + "properties": { + "pages": { + "type": "nested", + "properties": { + "page": { "type": "text" }, + "load_time": { "type": "double" } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Nested documents allow you to index the same JSON document but will keep your pages in separate Lucene documents, making only searches like `pages=landing` and `load_time=200` return the expected result. Internally, nested objects index each object in the array as a separate hidden document, meaning that each nested object can be queried independently of the others. + +You have to specify a nested path relative to parent that contains the nested documents: + + +```json +GET logs/_search +{ + "query": { + "match": { "response": "200" } + }, + "aggs": { + "pages": { + "nested": { + "path": "pages" + }, + "aggs": { + "min_load_time": { "min": { "field": "pages.load_time" } } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "pages" : { + "doc_count" : 2, + "min_price" : { + "value" : 200.0 + } + } + } +} +``` diff --git a/_aggregations/bucket/range.md b/_aggregations/bucket/range.md new file mode 100644 index 00000000..61ec2f62 --- /dev/null +++ b/_aggregations/bucket/range.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Range +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 150 +redirect_from: + - /query-dsl/aggregations/bucket/range/ +--- + +# Range aggregations + +The `range` aggregation lets you define the range for each bucket. + +For example, you can find the number of bytes between 1000 and 2000, 2000 and 3000, and 3000 and 4000. +Within the `range` parameter, you can define ranges as objects of an array. + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "number_of_bytes_distribution": { + "range": { + "field": "bytes", + "ranges": [ + { + "from": 1000, + "to": 2000 + }, + { + "from": 2000, + "to": 3000 + }, + { + "from": 3000, + "to": 4000 + } + ] + } + } + } +} +``` +{% include copy-curl.html %} + +The response includes the `from` key values and excludes the `to` key values: + +#### Example response + +```json +... 
+"aggregations" : { + "number_of_bytes_distribution" : { + "buckets" : [ + { + "key" : "1000.0-2000.0", + "from" : 1000.0, + "to" : 2000.0, + "doc_count" : 805 + }, + { + "key" : "2000.0-3000.0", + "from" : 2000.0, + "to" : 3000.0, + "doc_count" : 1369 + }, + { + "key" : "3000.0-4000.0", + "from" : 3000.0, + "to" : 4000.0, + "doc_count" : 1422 + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/reverse-nested.md b/_aggregations/bucket/reverse-nested.md new file mode 100644 index 00000000..bfd04986 --- /dev/null +++ b/_aggregations/bucket/reverse-nested.md @@ -0,0 +1,92 @@ +--- +layout: default +title: Reverse nested +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 160 +redirect_from: + - /query-dsl/aggregations/bucket/reverse-nested/ +--- + +# Reverse nested aggregations + +You can aggregate values from nested documents to their parent; this aggregation is called `reverse_nested`. +You can use `reverse_nested` to aggregate a field from the parent document after grouping by the field from the nested object. The `reverse_nested` aggregation "joins back" the root page and gets the `load_time` for each for your variations. + +The `reverse_nested` aggregation is a sub-aggregation inside a nested aggregation. It accepts a single option named `path`. This option defines how many steps backwards in the document hierarchy OpenSearch takes to calculate the aggregations. + +```json +GET logs/_search +{ + "query": { + "match": { "response": "200" } + }, + "aggs": { + "pages": { + "nested": { + "path": "pages" + }, + "aggs": { + "top_pages_per_load_time": { + "terms": { + "field": "pages.load_time" + }, + "aggs": { + "comment_to_logs": { + "reverse_nested": {}, + "aggs": { + "min_load_time": { + "min": { + "field": "pages.load_time" + } + } + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "pages" : { + "doc_count" : 2, + "top_pages_per_load_time" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : 200.0, + "doc_count" : 1, + "comment_to_logs" : { + "doc_count" : 1, + "min_load_time" : { + "value" : null + } + } + }, + { + "key" : 500.0, + "doc_count" : 1, + "comment_to_logs" : { + "doc_count" : 1, + "min_load_time" : { + "value" : null + } + } + } + ] + } + } + } +} +``` + +The response shows the logs index has one page with a `load_time` of 200 and one with a `load_time` of 500. \ No newline at end of file diff --git a/_aggregations/bucket/sampler.md b/_aggregations/bucket/sampler.md new file mode 100644 index 00000000..28bae47b --- /dev/null +++ b/_aggregations/bucket/sampler.md @@ -0,0 +1,84 @@ +--- +layout: default +title: Sampler +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 170 +--- + +# Sampler aggregations + +If you're aggregating a very large number of documents, you can use a `sampler` aggregation to reduce the scope to a small sample of documents, resulting in a faster response. The `sampler` aggregation selects the samples by top-scoring documents. + +The results are approximate but closely represent the distribution of the real data. The `sampler` aggregation significantly improves query performance, but the estimated responses are not entirely reliable. 
+ +The basic syntax is: + +```json +“aggs”: { + "SAMPLE": { + "sampler": { + "shard_size": 100 + }, + "aggs": {...} + } +} +``` + +## Shard size property + +The `shard_size` property tells OpenSearch how many documents (at most) to collect from each shard. + +The following example limits the number of documents collected on each shard to 1,000 and then buckets the documents by a `terms` aggregation: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "sample": { + "sampler": { + "shard_size": 1000 + }, + "aggs": { + "terms": { + "terms": { + "field": "agent.keyword" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "sample" : { + "doc_count" : 1000, + "terms" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1", + "doc_count" : 368 + }, + { + "key" : "Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.50 Safari/534.24", + "doc_count" : 329 + }, + { + "key" : "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)", + "doc_count" : 303 + } + ] + } + } + } +} +``` diff --git a/_aggregations/bucket/significant-terms.md b/_aggregations/bucket/significant-terms.md new file mode 100644 index 00000000..017e3b7d --- /dev/null +++ b/_aggregations/bucket/significant-terms.md @@ -0,0 +1,70 @@ +--- +layout: default +title: Significant terms +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 180 +--- + +# Significant terms aggregations + +The `significant_terms` aggregation lets you spot unusual or interesting term occurrences in a filtered subset relative to the rest of the data in an index. + +A foreground set is the set of documents that you filter. A background set is a set of all documents in an index. +The `significant_terms` aggregation examines all documents in the foreground set and finds a score for significant occurrences in contrast to the documents in the background set. + +In the sample web log data, each document has a field containing the `user-agent` of the visitor. This example searches for all requests from an iOS operating system. A regular `terms` aggregation on this foreground set returns Firefox because it has the most number of documents within this bucket. On the other hand, a `significant_terms` aggregation returns Internet Explorer (IE) because IE has a significantly higher appearance in the foreground set as compared to the background set. + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "query": { + "terms": { + "machine.os.keyword": [ + "ios" + ] + } + }, + "aggs": { + "significant_response_codes": { + "significant_terms": { + "field": "agent.keyword" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "significant_response_codes" : { + "doc_count" : 2737, + "bg_count" : 14074, + "buckets" : [ + { + "key" : "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)", + "doc_count" : 818, + "score" : 0.01462731514608217, + "bg_count" : 4010 + }, + { + "key" : "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1", + "doc_count" : 1067, + "score" : 0.009062566630410223, + "bg_count" : 5362 + } + ] + } + } +} +``` + +If the `significant_terms` aggregation doesn't return any result, you might have not filtered the results with a query. 
Alternatively, the distribution of terms in the foreground set might be the same as the background set, implying that there isn't anything unusual in the foreground set. + +The default source of statistical information for background term frequencies is the entire index. You can narrow this scope with a background filter for more focus + diff --git a/_aggregations/bucket/significant-text.md b/_aggregations/bucket/significant-text.md new file mode 100644 index 00000000..1c136603 --- /dev/null +++ b/_aggregations/bucket/significant-text.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Significant text +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 190 +--- + +# Significant text aggregations + +The `significant_text` aggregation is similar to the `significant_terms` aggregation but it's for raw text fields. +Significant text measures the change in popularity measured between the foreground and background sets using statistical analysis. For example, it might suggest Tesla when you look for its stock acronym TSLA. + +The `significant_text` aggregation re-analyzes the source text on the fly, filtering noisy data like duplicate paragraphs, boilerplate headers and footers, and so on, which might otherwise skew the results. + +Re-analyzing high-cardinality datasets can be a very CPU-intensive operation. We recommend using the `significant_text` aggregation inside a sampler aggregation to limit the analysis to a small selection of top-matching documents, for example 200. + +You can set the following parameters: + +- `min_doc_count` - Return results that match more than a configured number of top hits. We recommend not setting `min_doc_count` to 1 because it tends to return terms that are typos or misspellings. Finding more than one instance of a term helps reinforce that the significance is not the result of a one-off accident. The default value of 3 is used to provide a minimum weight-of-evidence. +- `shard_size` - Setting a high value increases stability (and accuracy) at the expense of computational performance. +- `shard_min_doc_count` - If your text contains many low frequency words and you're not interested in these (for example typos), then you can set the `shard_min_doc_count` parameter to filter out candidate terms at a shard level with a reasonable certainty to not reach the required `min_doc_count` even after merging the local significant text frequencies. The default value is 1, which has no impact until you explicitly set it. We recommend setting this value much lower than the `min_doc_count` value. + +Assume that you have the complete works of Shakespeare indexed in an OpenSearch cluster. 
You can find significant texts in relation to the word "breathe" in the `text_entry` field: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": "breathe" + } + }, + "aggregations": { + "my_sample": { + "sampler": { + "shard_size": 100 + }, + "aggregations": { + "keywords": { + "significant_text": { + "field": "text_entry", + "min_doc_count": 4 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +"aggregations" : { + "my_sample" : { + "doc_count" : 59, + "keywords" : { + "doc_count" : 59, + "bg_count" : 111396, + "buckets" : [ + { + "key" : "breathe", + "doc_count" : 59, + "score" : 1887.0677966101694, + "bg_count" : 59 + }, + { + "key" : "air", + "doc_count" : 4, + "score" : 2.641295376716233, + "bg_count" : 189 + }, + { + "key" : "dead", + "doc_count" : 4, + "score" : 0.9665839666414213, + "bg_count" : 495 + }, + { + "key" : "life", + "doc_count" : 5, + "score" : 0.9090787433467572, + "bg_count" : 805 + } + ] + } + } + } +} +``` + +The most significant texts in relation to `breathe` are `air`, `dead`, and `life`. + +The `significant_text` aggregation has the following limitations: + +- Doesn't support child aggregations because child aggregations come at a high memory cost. As a workaround, you can add a follow-up query using a `terms` aggregation with an include clause and a child aggregation. +- Doesn't support nested objects because it works with the document JSON source. +- The counts of documents might have some (typically small) inaccuracies as it's based on summing the samples returned from each shard. You can use the `shard_size` parameter to fine-tune the trade-off between accuracy and performance. By default, the `shard_size` is set to -1 to automatically estimate the number of shards and the `size` parameter. + +The default source of statistical information for background term frequencies is the entire index. You can narrow this scope with a background filter for more focus: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": "breathe" + } + }, + "aggregations": { + "my_sample": { + "sampler": { + "shard_size": 100 + }, + "aggregations": { + "keywords": { + "significant_text": { + "field": "text_entry", + "background_filter": { + "term": { + "speaker": "JOHN OF GAUNT" + } + } + } + } + } + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/terms.md b/_aggregations/bucket/terms.md new file mode 100644 index 00000000..072ad42c --- /dev/null +++ b/_aggregations/bucket/terms.md @@ -0,0 +1,191 @@ +--- +layout: default +title: Terms +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 200 +--- + +# Terms aggregations + +The `terms` aggregation dynamically creates a bucket for each unique term of a field. + +The following example uses the `terms` aggregation to find the number of documents per response code in web log data: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "response_codes": { + "terms": { + "field": "response.keyword", + "size": 10 + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "response_codes" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "200", + "doc_count" : 12832 + }, + { + "key" : "404", + "doc_count" : 801 + }, + { + "key" : "503", + "doc_count" : 441 + } + ] + } + } +} +``` + +The values are returned with the key `key`. 
+`doc_count` specifies the number of documents in each bucket. By default, the buckets are sorted in descending order of `doc-count`. + + +## Size and shard size parameters + +The number of buckets returned by the `terms` aggregation is controlled by the `size` parameter, which is 10 by default. + +Additionally, the coordinating node responsible for the aggregation will prompt each shard for its top unique terms. The number of buckets returned by each shard is controlled by the `shard_size` parameter. This parameter is distinct from the `size` parameter and exists as a mechanism to increase the accuracy of the bucket document counts. + +For example, imagine a scenario in which the `size` and `shard_size` parameters both have a value of 3. The `terms` aggregation prompts each shard for its top three unique terms. The coordinating node aggregates the results to compute the final result. If a shard contains an object that is not included in the top three, then it won't show up in the response. However, increasing the `shard_size` value for this request will allow each shard to return a larger number of unique terms, increasing the likelihood that the coordinating node will receive all relevant results. + +By default, the `shard_size` parameter is set to `size * 1.5 + 10`. + +When using concurrent segment search, the `shard_size` parameter is also applied to each segment slice. + +The `shard_size` parameter serves as a way to balance the performance and document count accuracy of the `terms` aggregation. Higher `shard_size` values will ensure higher document count accuracy but will result in higher memory and compute usage. Lower `shard_size` values will be more performant but will result in lower document count accuracy. + +## Document count error + +The response also includes two keys named `doc_count_error_upper_bound` and `sum_other_doc_count`. + +The `terms` aggregation returns the top unique terms. Therefore, if the data contains many unique terms, then some of them might not appear in the results. The `sum_other_doc_count` field represents the sum of the documents that are excluded from the response. In this case, the number is 0 because all of the unique values appear in the response. + +The `doc_count_error_upper_bound` field represents the maximum possible count for a unique value that is excluded from the final results. Use this field to estimate the margin of error for the count. + +The `doc_count_error_upper_bound` value and the concept of accuracy are only applicable to aggregations using the default sort order---by document count, descending. This is because when you sort by descending document count, any terms that were not returned are guaranteed to include equal or fewer documents than those terms that were returned. Based on this, you can compute the `doc_count_error_upper_bound`. + +If the `show_term_doc_count_error` parameter is set to `true`, then the `terms` aggregation will show the `doc_count_error_upper_bound` computed for each unique bucket in addition to the overall value. + +## The `min_doc_count` and `shard_min_doc_count` parameters + +You can use the `min_doc_count` parameter to filter out any unique terms with fewer than `min_doc_count` results. The `min_doc_count` threshold is applied only after merging the results retrieved from all of the shards. Each shard is unaware of the global document count for a given term. 
If there is a significant difference between the top `shard_size` globally frequent terms and the top terms local to a shard, you may receive unexpected results when using the `min_doc_count` parameter. + +Separately, the `shard_min_doc_count` parameter is used to filter out the unique terms that a shard returns back to the coordinator with fewer than `shard_min_doc_count` results. + +When using concurrent segment search, the `shard_min_doc_count` parameter is not applied to each segment slice. For more information, see the [related GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/11847). + +## Collect mode + +There are two collect modes available: `depth_first` and `breadth_first`. The `depth_first` collection mode expands all branches of the aggregation tree in a depth-first manner and only performs pruning after the expansion is complete. + +However, when using nested `terms` aggregations, the cardinality of the number of buckets returned is multiplied by the cardinality of the field at each level of nesting, making it easy to see combinatorial explosion in the bucket count as you nest aggregations. + +You can use the `breadth_first` collection mode to address this issue. In this case, pruning will be applied to the first level of the aggregation tree before it is expanded to the next level, potentially greatly reducing the number of buckets computed. + +Additionally, there is memory overhead associated with performing `breadth_first` collection, which is linearly related to the number of matching documents. This is because `breadth_first` collection works by caching and replaying the pruned set of buckets from the parent level. + + +## Account for pre-aggregated data + +While the `doc_count` field provides a representation of the number of individual documents aggregated in a bucket, `doc_count` by itself does not have a way to correctly increment documents that store pre-aggregated data. To account for pre-aggregated data and accurately calculate the number of documents in a bucket, you can use the `_doc_count` field to add the number of documents in a single summary field. When a document includes the `_doc_count` field, all bucket aggregations recognize its value and increase the bucket `doc_count` cumulatively. Keep these considerations in mind when using the `_doc_count` field: + +* The field does not support nested arrays; only positive integers can be used. +* If a document does not contain the `_doc_count` field, aggregation uses the document to increase the count by 1. + +OpenSearch features that rely on an accurate document count illustrate the importance of using the `_doc_count` field. To see how this field can be used to support other search tools, refer to [Index rollups](https://opensearch.org/docs/latest/im-plugin/index-rollups/index/), an OpenSearch feature for the Index Management (IM) plugin that stores documents with pre-aggregated data in rollup indexes. 
+{: .tip} + +#### Example request + +```json +PUT /my_index/_doc/1 +{ + "response_code": 404, + "date":"2022-08-05", + "_doc_count": 20 +} + +PUT /my_index/_doc/2 +{ + "response_code": 404, + "date":"2022-08-06", + "_doc_count": 10 +} + +PUT /my_index/_doc/3 +{ + "response_code": 200, + "date":"2022-08-06", + "_doc_count": 300 +} + +GET /my_index/_search +{ + "size": 0, + "aggs": { + "response_codes": { + "terms": { + "field" : "response_code" + } + } + } +} +``` + +#### Example response + +```json +{ + "took" : 20, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "response_codes" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : 200, + "doc_count" : 300 + }, + { + "key" : 404, + "doc_count" : 30 + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_opensearch/aggregations.md b/_aggregations/index.md similarity index 93% rename from _opensearch/aggregations.md rename to _aggregations/index.md index 69c7cd0b..1c43799a 100644 --- a/_opensearch/aggregations.md +++ b/_aggregations/index.md @@ -1,8 +1,14 @@ --- layout: default title: Aggregations -nav_order: 14 has_children: true +nav_order: 5 +nav_exclude: true +permalink: /aggregations/ +redirect_from: + - /opensearch/aggregations/ + - /query-dsl/aggregations/ + - /aggregations/index/ --- # Aggregations @@ -86,7 +92,7 @@ GET opensearch_dashboards_sample_data_ecommerce/_search } ``` -#### Sample response +#### Example response ```json { @@ -156,3 +162,7 @@ Bucket aggregations produce buckets of documents that you can nest in other aggr The inner `aggs` keyword begins a new nested aggregation. The syntax of the parent aggregation and the nested aggregation is the same. Nested aggregations run in the context of the preceding parent aggregations. You can also pair your aggregations with search queries to narrow down things you’re trying to analyze before aggregating. If you don't add a query, OpenSearch implicitly uses the `match_all` query. + +## Limitations + +Because aggregators are processed using the `double` data type for all values, `long` values of 253 and greater are approximate. diff --git a/_aggregations/metric/average.md b/_aggregations/metric/average.md new file mode 100644 index 00000000..428f1e76 --- /dev/null +++ b/_aggregations/metric/average.md @@ -0,0 +1,58 @@ +--- +layout: default +title: Average +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 10 +redirect_from: + - /query-dsl/aggregations/metric/average/ +--- + +# Average aggregations + +The `avg` metric is a single-value metric aggregations that returns the average value of a field. 
+ +The following example calculates the average of the `taxful_total_price` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "avg_taxful_total_price": { + "avg": { + "field": "taxful_total_price" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 85, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "avg_taxful_total_price": { + "value": 75.05542864304813 + } + } +} +``` diff --git a/_aggregations/metric/cardinality.md b/_aggregations/metric/cardinality.md new file mode 100644 index 00000000..c40dbb44 --- /dev/null +++ b/_aggregations/metric/cardinality.md @@ -0,0 +1,62 @@ +--- +layout: default +title: Cardinality +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 20 +redirect_from: + - /query-dsl/aggregations/metric/cardinality/ +--- + +# Cardinality aggregations + +The `cardinality` metric is a single-value metric aggregation that counts the number of unique or distinct values of a field. + +The following example finds the number of unique products in an eCommerce store: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "unique_products": { + "cardinality": { + "field": "products.product_id" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... + "aggregations" : { + "unique_products" : { + "value" : 7033 + } + } +} +``` + +Cardinality count is approximate. +If you have tens of thousands of products in your hypothetical store, an accurate cardinality calculation requires loading all the values into a hash set and returning its size. This approach doesn't scale well; it requires huge amounts of memory and can cause high latencies. + +You can control the trade-off between memory and accuracy with the `precision_threshold` setting. This setting defines the threshold below which counts are expected to be close to accurate. Above this value, counts might become a bit less accurate. The default value of `precision_threshold` is 3,000. The maximum supported value is 40,000. + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "unique_products": { + "cardinality": { + "field": "products.product_id", + "precision_threshold": 10000 + } + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/metric/extended-stats.md b/_aggregations/metric/extended-stats.md new file mode 100644 index 00000000..633407da --- /dev/null +++ b/_aggregations/metric/extended-stats.md @@ -0,0 +1,77 @@ +--- +layout: default +title: Extended stats +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 30 +redirect_from: + - /query-dsl/aggregations/metric/extended-stats/ +--- + +# Extended stats aggregations + +The `extended_stats` aggregation is an extended version of the [`stats`]({{site.url}}{{site.baseurl}}/query-dsl/aggregations/metric/stats/) aggregation. Apart from including basic stats, `extended_stats` also returns stats such as `sum_of_squares`, `variance`, and `std_deviation`. 
+The following example returns extended stats for `taxful_total_price`: +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "extended_stats_taxful_total_price": { + "extended_stats": { + "field": "taxful_total_price" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "extended_stats_taxful_total_price" : { + "count" : 4675, + "min" : 6.98828125, + "max" : 2250.0, + "avg" : 75.05542864304813, + "sum" : 350884.12890625, + "sum_of_squares" : 3.9367749294174194E7, + "variance" : 2787.59157113862, + "variance_population" : 2787.59157113862, + "variance_sampling" : 2788.187974983536, + "std_deviation" : 52.79764740155209, + "std_deviation_population" : 52.79764740155209, + "std_deviation_sampling" : 52.80329511482722, + "std_deviation_bounds" : { + "upper" : 180.6507234461523, + "lower" : -30.53986616005605, + "upper_population" : 180.6507234461523, + "lower_population" : -30.53986616005605, + "upper_sampling" : 180.66201887270256, + "lower_sampling" : -30.551161586606312 + } + } + } +} +``` + +The `std_deviation_bounds` object provides a visual variance of the data with an interval of plus/minus two standard deviations from the mean. +To set the standard deviation to a different value, say 3, set `sigma` to 3: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "extended_stats_taxful_total_price": { + "extended_stats": { + "field": "taxful_total_price", + "sigma": 3 + } + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/metric/geobounds.md b/_aggregations/metric/geobounds.md new file mode 100644 index 00000000..27b7646c --- /dev/null +++ b/_aggregations/metric/geobounds.md @@ -0,0 +1,229 @@ +--- +layout: default +title: Geobounds +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 40 +redirect_from: + - /query-dsl/aggregations/metric/geobounds/ +--- + +## Geobounds aggregations + +The `geo_bounds` metric is a multi-value metric aggregation that calculates the [geographic bounding box](https://docs.ogc.org/is/12-063r5/12-063r5.html#30) containing all values of a given `geo_point` or `geo_shape` field. The bounding box is returned as the upper-left and lower-right vertices of the rectangle in terms of latitude and longitude. 
+ +The following example returns the `geo_bounds` metrics for the `geoip.location` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "geo": { + "geo_bounds": { + "field": "geoip.location" + } + } + } +} +``` + +#### Example response + +```json +"aggregations" : { + "geo" : { + "bounds" : { + "top_left" : { + "lat" : 52.49999997206032, + "lon" : -118.20000001229346 + }, + "bottom_right" : { + "lat" : 4.599999985657632, + "lon" : 55.299999956041574 + } + } + } + } +} +``` + +## Aggregating geoshapes + +To run an aggregation on a geoshape field, first create an index and map the `location` field as a `geo_shape`: + +```json +PUT national_parks +{ + "mappings": { + "properties": { + "location": { + "type": "geo_shape" + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index some documents into the `national_parks` index: + +```json +PUT national_parks/_doc/1 +{ + "name": "Yellowstone National Park", + "location": + {"type": "envelope","coordinates": [ [-111.15, 45.12], [-109.83, 44.12] ]} +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/2 +{ + "name": "Yosemite National Park", + "location": + {"type": "envelope","coordinates": [ [-120.23, 38.16], [-119.05, 37.45] ]} +} +``` +{% include copy-curl.html %} + +```json +PUT national_parks/_doc/3 +{ + "name": "Death Valley National Park", + "location": + {"type": "envelope","coordinates": [ [-117.34, 37.01], [-116.38, 36.25] ]} +} +``` +{% include copy-curl.html %} + +You can run a `geo_bounds` aggregation on the `location` field as follows: + +```json +GET national_parks/_search +{ + "aggregations": { + "grouped": { + "geo_bounds": { + "field": "location", + "wrap_longitude": true + } + } + } +} +``` +{% include copy-curl.html %} + +The optional `wrap_longitude` parameter specifies whether the bounding box returned by the aggregation can overlap the international date line (180° meridian). If `wrap_longitude` is set to `true`, the bounding box can overlap the international date line and return a `bounds` object in which the lower-left longitude is greater than the upper-right longitude. The default value for `wrap_longitude` is `true`. + +The response contains the geo-bounding box that encloses all shapes in the `location` field: + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "national_parks", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "name" : "Yellowstone National Park", + "location" : { + "type" : "envelope", + "coordinates" : [ + [ + -111.15, + 45.12 + ], + [ + -109.83, + 44.12 + ] + ] + } + } + }, + { + "_index" : "national_parks", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "name" : "Yosemite National Park", + "location" : { + "type" : "envelope", + "coordinates" : [ + [ + -120.23, + 38.16 + ], + [ + -119.05, + 37.45 + ] + ] + } + } + }, + { + "_index" : "national_parks", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "name" : "Death Valley National Park", + "location" : { + "type" : "envelope", + "coordinates" : [ + [ + -117.34, + 37.01 + ], + [ + -116.38, + 36.25 + ] + ] + } + } + } + ] + }, + "aggregations" : { + "Grouped" : { + "bounds" : { + "top_left" : { + "lat" : 45.11999997776002, + "lon" : -120.23000006563962 + }, + "bottom_right" : { + "lat" : 36.249999976716936, + "lon" : -109.83000006526709 + } + } + } + } +} +``` +
+ +Currently, OpenSearch supports geoshape aggregation through the API but not in OpenSearch Dashboards visualizations. If you'd like to see geoshape aggregation implemented for visualizations, upvote the related [GitHub issue](https://github.com/opensearch-project/dashboards-maps/issues/250). +{: .note} diff --git a/_aggregations/metric/index.md b/_aggregations/metric/index.md new file mode 100644 index 00000000..7553933c --- /dev/null +++ b/_aggregations/metric/index.md @@ -0,0 +1,47 @@ +--- +layout: default +title: Metric aggregations +has_children: true +has_toc: false +nav_order: 2 +redirect_from: + - /opensearch/metric-agg/ + - /query-dsl/aggregations/metric-agg/ + - /aggregations/metric-agg/ + - /query-dsl/aggregations/metric/ +--- + +# Metric aggregations + +Metric aggregations let you perform simple calculations such as finding the minimum, maximum, and average values of a field. + +## Types of metric aggregations + +There are two types of metric aggregations: single-value metric aggregations and multi-value metric aggregations. + +### Single-value metric aggregations + +Single-value metric aggregations return a single metric, for example, `sum`, `min`, `max`, `avg`, `cardinality`, or `value_count`. + +### Multi-value metric aggregations + +Multi-value metric aggregations return more than one metric. These include `stats`, `extended_stats`, `matrix_stats`, `percentile`, `percentile_ranks`, `geo_bound`, `top_hits`, and `scripted_metric`. + +## Supported metric aggregations + +OpenSearch supports the following metric aggregations: + +- [Average]({{site.url}}{{site.baseurl}}/aggregations/metric/average/) +- [Cardinality]({{site.url}}{{site.baseurl}}/aggregations/metric/cardinality/) +- [Extended stats]({{site.url}}{{site.baseurl}}/aggregations/metric/extended-stats/) +- [Geobounds]({{site.url}}{{site.baseurl}}/aggregations/metric/geobounds/) +- [Matrix stats]({{site.url}}{{site.baseurl}}/aggregations/metric/matrix-stats/) +- [Maximum]({{site.url}}{{site.baseurl}}/aggregations/metric/maximum/) +- [Minimum]({{site.url}}{{site.baseurl}}/aggregations/metric/minimum/) +- [Percentile ranks]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile-ranks/) +- [Percentile]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/) +- [Scripted metric]({{site.url}}{{site.baseurl}}/aggregations/metric/scripted-metric/) +- [Stats]({{site.url}}{{site.baseurl}}/aggregations/metric/stats/) +- [Sum]({{site.url}}{{site.baseurl}}/aggregations/metric/sum/) +- [Top hits]({{site.url}}{{site.baseurl}}/aggregations/metric/top-hits/) +- [Value count]({{site.url}}{{site.baseurl}}/aggregations/metric/value-count/) \ No newline at end of file diff --git a/_aggregations/metric/matrix-stats.md b/_aggregations/metric/matrix-stats.md new file mode 100644 index 00000000..475e0caa --- /dev/null +++ b/_aggregations/metric/matrix-stats.md @@ -0,0 +1,87 @@ +--- +layout: default +title: Matrix stats +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 50 +redirect_from: + - /query-dsl/aggregations/metric/matrix-stats/ +--- + +# Matrix stats aggregations + +The `matrix_stats` aggregation generates advanced stats for multiple fields in a matrix form. 
+The following example returns advanced stats in a matrix form for the `taxful_total_price` and `products.base_price` fields: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "matrix_stats_taxful_total_price": { + "matrix_stats": { + "fields": ["taxful_total_price", "products.base_price"] + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "matrix_stats_taxful_total_price" : { + "doc_count" : 4675, + "fields" : [ + { + "name" : "products.base_price", + "count" : 4675, + "mean" : 34.994239430147196, + "variance" : 360.5035285833703, + "skewness" : 5.530161335032702, + "kurtosis" : 131.16306324042148, + "covariance" : { + "products.base_price" : 360.5035285833703, + "taxful_total_price" : 846.6489362233166 + }, + "correlation" : { + "products.base_price" : 1.0, + "taxful_total_price" : 0.8444765264325268 + } + }, + { + "name" : "taxful_total_price", + "count" : 4675, + "mean" : 75.05542864304839, + "variance" : 2788.1879749835402, + "skewness" : 15.812149139924037, + "kurtosis" : 619.1235507385902, + "covariance" : { + "products.base_price" : 846.6489362233166, + "taxful_total_price" : 2788.1879749835402 + }, + "correlation" : { + "products.base_price" : 0.8444765264325268, + "taxful_total_price" : 1.0 + } + } + ] + } + } +} +``` + +The following table lists all response fields. + +Statistic | Description +:--- | :--- +`count` | The number of samples measured. +`mean` | The average value of the field measured from the sample. +`variance` | How far the values of the field measured are spread out from its mean value. The larger the variance, the more it's spread from its mean value. +`skewness` | An asymmetric measure of the distribution of the field's values around the mean. +`kurtosis` | A measure of the tail heaviness of a distribution. As the tail becomes lighter, kurtosis decreases. As the tail becomes heavier, kurtosis increases. To learn about kurtosis, see [Wikipedia](https://en.wikipedia.org/wiki/Kurtosis). +`covariance` | A measure of the joint variability between two fields. A positive value means their values move in the same direction and the other way around. +`correlation` | A measure of the strength of the relationship between two fields. The valid values are between [-1, 1]. A value of -1 means that the value is negatively correlated and a value of 1 means that it's positively correlated. A value of 0 means that there's no identifiable relationship between them. \ No newline at end of file diff --git a/_aggregations/metric/maximum.md b/_aggregations/metric/maximum.md new file mode 100644 index 00000000..63b4d62a --- /dev/null +++ b/_aggregations/metric/maximum.md @@ -0,0 +1,58 @@ +--- +layout: default +title: Maximum +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 60 +redirect_from: + - /query-dsl/aggregations/metric/maximum/ +--- + +# Maximum aggregations + +The `max` metric is a single-value metric aggregations that returns the maximum value of a field. 
+ +The following example calculates the maximum of the `taxful_total_price` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "max_taxful_total_price": { + "max": { + "field": "taxful_total_price" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 17, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "max_taxful_total_price": { + "value": 2250 + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/metric/minimum.md b/_aggregations/metric/minimum.md new file mode 100644 index 00000000..dd17c854 --- /dev/null +++ b/_aggregations/metric/minimum.md @@ -0,0 +1,58 @@ +--- +layout: default +title: Minimum +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 70 +redirect_from: + - /query-dsl/aggregations/metric/minimum/ +--- + +# Minimum aggregations + +The `min` metric is a single-value metric aggregations that returns the minimum value of a field. + +The following example calculates the minimum of the `taxful_total_price` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "min_taxful_total_price": { + "min": { + "field": "taxful_total_price" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 13, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "min_taxful_total_price": { + "value": 6.98828125 + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md new file mode 100644 index 00000000..33ccb3d2 --- /dev/null +++ b/_aggregations/metric/percentile-ranks.md @@ -0,0 +1,47 @@ +--- +layout: default +title: Percentile ranks +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 80 +redirect_from: + - /query-dsl/aggregations/metric/percentile-ranks/ +--- + +# Percentile rank aggregations + +Percentile rank is the percentile of values at or below a threshold grouped by a specified value. For example, if a value is greater than or equal to 80% of the values, it has a percentile rank of 80. + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "percentile_rank_taxful_total_price": { + "percentile_ranks": { + "field": "taxful_total_price", + "values": [ + 10, + 15 + ] + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "percentile_rank_taxful_total_price" : { + "values" : { + "10.0" : 0.055096056411283456, + "15.0" : 0.0830092961834656 + } + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md new file mode 100644 index 00000000..c68b0e0e --- /dev/null +++ b/_aggregations/metric/percentile.md @@ -0,0 +1,54 @@ +--- +layout: default +title: Percentile +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 90 +redirect_from: + - /query-dsl/aggregations/metric/percentile/ +--- + +# Percentile aggregations + +Percentile is the percentage of the data that's at or below a certain threshold value. 
+ +The `percentile` metric is a multi-value metric aggregation that lets you find outliers in your data or figure out the distribution of your data. + +Like the `cardinality` metric, the `percentile` metric is also approximate. + +The following example calculates the percentile in relation to the `taxful_total_price` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "percentile_taxful_total_price": { + "percentiles": { + "field": "taxful_total_price" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "percentile_taxful_total_price" : { + "values" : { + "1.0" : 21.984375, + "5.0" : 27.984375, + "25.0" : 44.96875, + "50.0" : 64.22061688311689, + "75.0" : 93.0, + "95.0" : 156.0, + "99.0" : 222.0 + } + } + } +} +``` diff --git a/_aggregations/metric/scripted-metric.md b/_aggregations/metric/scripted-metric.md new file mode 100644 index 00000000..d1807efb --- /dev/null +++ b/_aggregations/metric/scripted-metric.md @@ -0,0 +1,73 @@ +--- +layout: default +title: Scripted metric +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 100 +redirect_from: + - /query-dsl/aggregations/metric/scripted-metric/ +--- + +# Scripted metric aggregations + +The `scripted_metric` metric is a multi-value metric aggregation that returns metrics calculated from a specified script. + +A script has four stages: the initial stage, the map stage, the combine stage, and the reduce stage. + +* `init_script`: (OPTIONAL) Sets the initial state and executes before any collection of documents. +* `map_script`: Checks the value of the `type` field and executes the aggregation on the collected documents. +* `combine_script`: Aggregates the state returned from every shard. The aggregated value is returned to the coordinating node. +* `reduce_script`: Provides access to the variable states; this variable combines the results from the `combine_script` on each shard into an array. + +The following example aggregates the different HTTP response types in web log data: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggregations": { + "responses.counts": { + "scripted_metric": { + "init_script": "state.responses = ['error':0L,'success':0L,'other':0L]", + "map_script": """ + def code = doc['response.keyword'].value; + if (code.startsWith('5') || code.startsWith('4')) { + state.responses.error += 1 ; + } else if(code.startsWith('2')) { + state.responses.success += 1; + } else { + state.responses.other += 1; + } + """, + "combine_script": "state.responses", + "reduce_script": """ + def counts = ['error': 0L, 'success': 0L, 'other': 0L]; + for (responses in states) { + counts.error += responses['error']; + counts.success += responses['success']; + counts.other += responses['other']; + } + return counts; + """ + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... 
+"aggregations" : { + "responses.counts" : { + "value" : { + "other" : 0, + "success" : 12832, + "error" : 1242 + } + } + } +} +``` diff --git a/_aggregations/metric/stats.md b/_aggregations/metric/stats.md new file mode 100644 index 00000000..0a548315 --- /dev/null +++ b/_aggregations/metric/stats.md @@ -0,0 +1,46 @@ +--- +layout: default +title: Stats +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 110 +redirect_from: + - /query-dsl/aggregations/metric/stats/ +--- + +# Stats aggregations + +The `stats` metric is a multi-value metric aggregation that returns all basic metrics such as `min`, `max`, `sum`, `avg`, and `value_count` in one aggregation query. + +The following example returns the basic stats for the `taxful_total_price` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "stats_taxful_total_price": { + "stats": { + "field": "taxful_total_price" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... +"aggregations" : { + "stats_taxful_total_price" : { + "count" : 4675, + "min" : 6.98828125, + "max" : 2250.0, + "avg" : 75.05542864304813, + "sum" : 350884.12890625 + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/metric/sum.md b/_aggregations/metric/sum.md new file mode 100644 index 00000000..0320de63 --- /dev/null +++ b/_aggregations/metric/sum.md @@ -0,0 +1,58 @@ +--- +layout: default +title: Sum +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 120 +redirect_from: + - /query-dsl/aggregations/metric/sum/ +--- + +# Sum aggregations + +The `sum` metric is a single-value metric aggregations that returns the sum of the values of a field. + +The following example calculates the total sum of the `taxful_total_price` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "sum_taxful_total_price": { + "sum": { + "field": "taxful_total_price" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 16, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "sum_taxful_total_price": { + "value": 350884.12890625 + } + } +} +``` diff --git a/_aggregations/metric/top-hits.md b/_aggregations/metric/top-hits.md new file mode 100644 index 00000000..b6752300 --- /dev/null +++ b/_aggregations/metric/top-hits.md @@ -0,0 +1,149 @@ +--- +layout: default +title: Top hits +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 130 +redirect_from: + - /query-dsl/aggregations/metric/top-hits/ +--- + +# Top hits aggregations + +The `top_hits` metric is a multi-value metric aggregation that ranks the matching documents based on a relevance score for the field that's being aggregated. + +You can specify the following options: + +- `from`: The starting position of the hit. +- `size`: The maximum size of hits to return. The default value is 3. +- `sort`: How the matching hits are sorted. By default, the hits are sorted by the relevance score of the aggregation query. + +The following example returns the top 5 products in your eCommerce data: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "top_hits_products": { + "top_hits": { + "size": 5 + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... 
+"aggregations" : { + "top_hits_products" : { + "hits" : { + "total" : { + "value" : 4675, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "opensearch_dashboards_sample_data_ecommerce", + "_type" : "_doc", + "_id" : "glMlwXcBQVLeQPrkHPtI", + "_score" : 1.0, + "_source" : { + "category" : [ + "Women's Accessories", + "Women's Clothing" + ], + "currency" : "EUR", + "customer_first_name" : "rania", + "customer_full_name" : "rania Evans", + "customer_gender" : "FEMALE", + "customer_id" : 24, + "customer_last_name" : "Evans", + "customer_phone" : "", + "day_of_week" : "Sunday", + "day_of_week_i" : 6, + "email" : "rania@evans-family.zzz", + "manufacturer" : [ + "Tigress Enterprises" + ], + "order_date" : "2021-02-28T14:16:48+00:00", + "order_id" : 583581, + "products" : [ + { + "base_price" : 10.99, + "discount_percentage" : 0, + "quantity" : 1, + "manufacturer" : "Tigress Enterprises", + "tax_amount" : 0, + "product_id" : 19024, + "category" : "Women's Accessories", + "sku" : "ZO0082400824", + "taxless_price" : 10.99, + "unit_discount_amount" : 0, + "min_price" : 5.17, + "_id" : "sold_product_583581_19024", + "discount_amount" : 0, + "created_on" : "2016-12-25T14:16:48+00:00", + "product_name" : "Snood - white/grey/peach", + "price" : 10.99, + "taxful_price" : 10.99, + "base_unit_price" : 10.99 + }, + { + "base_price" : 32.99, + "discount_percentage" : 0, + "quantity" : 1, + "manufacturer" : "Tigress Enterprises", + "tax_amount" : 0, + "product_id" : 19260, + "category" : "Women's Clothing", + "sku" : "ZO0071900719", + "taxless_price" : 32.99, + "unit_discount_amount" : 0, + "min_price" : 17.15, + "_id" : "sold_product_583581_19260", + "discount_amount" : 0, + "created_on" : "2016-12-25T14:16:48+00:00", + "product_name" : "Cardigan - grey", + "price" : 32.99, + "taxful_price" : 32.99, + "base_unit_price" : 32.99 + } + ], + "sku" : [ + "ZO0082400824", + "ZO0071900719" + ], + "taxful_total_price" : 43.98, + "taxless_total_price" : 43.98, + "total_quantity" : 2, + "total_unique_products" : 2, + "type" : "order", + "user" : "rani", + "geoip" : { + "country_iso_code" : "EG", + "location" : { + "lon" : 31.3, + "lat" : 30.1 + }, + "region_name" : "Cairo Governorate", + "continent_name" : "Africa", + "city_name" : "Cairo" + }, + "event" : { + "dataset" : "sample_ecommerce" + } + } + ... + } + ] + } + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/metric/value-count.md b/_aggregations/metric/value-count.md new file mode 100644 index 00000000..dfddaf94 --- /dev/null +++ b/_aggregations/metric/value-count.md @@ -0,0 +1,42 @@ +--- +layout: default +title: Value count +parent: Metric aggregations +grand_parent: Aggregations +nav_order: 140 +redirect_from: + - /query-dsl/aggregations/metric/value-count/ +--- + +# Value count aggregations + +The `value_count` metric is a single-value metric aggregation that calculates the number of values that an aggregation is based on. + +For example, you can use the `value_count` metric with the `avg` metric to find how many numbers the aggregation uses to calculate an average value. + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "number_of_values": { + "value_count": { + "field": "taxful_total_price" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +... 
+ "aggregations" : { + "number_of_values" : { + "value" : 4675 + } + } +} +``` \ No newline at end of file diff --git a/_opensearch/pipeline-agg.md b/_aggregations/pipeline-agg.md similarity index 97% rename from _opensearch/pipeline-agg.md rename to _aggregations/pipeline-agg.md index 892cb19c..30fb0ecf 100644 --- a/_opensearch/pipeline-agg.md +++ b/_aggregations/pipeline-agg.md @@ -1,9 +1,11 @@ --- layout: default -title: Pipeline Aggregations -parent: Aggregations -nav_order: 4 +title: Pipeline aggregations +nav_order: 5 has_children: false +redirect_from: + - /opensearch/pipeline-agg/ + - /query-dsl/aggregations/pipeline-agg/ --- # Pipeline aggregations @@ -14,7 +16,7 @@ You can use pipeline aggregations to compute complex statistical and mathematica ## Pipeline aggregation syntax -A pipeline aggregation uses the the `buckets_path` property to access the results of other aggregations. +A pipeline aggregation uses the `buckets_path` property to access the results of other aggregations. The `buckets_path` property has a specific syntax: ``` @@ -66,7 +68,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json ... @@ -149,7 +151,7 @@ POST opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json ... @@ -224,7 +226,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json ... @@ -241,7 +243,7 @@ GET opensearch_dashboards_sample_data_logs/_search The `extended_stats` aggregation is an extended version of the `stats` aggregation. Apart from including basic stats, `extended_stats` also provides stats such as `sum_of_squares`, `variance`, and `std_deviation`. -#### Sample response +#### Example response ```json "stats_monthly_visits" : { @@ -343,7 +345,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json "aggregations" : { @@ -439,7 +441,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json "aggregations" : { @@ -529,7 +531,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json "aggregations" : { @@ -602,7 +604,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json ... @@ -650,11 +652,11 @@ GET opensearch_dashboards_sample_data_logs/_search ## derivative -The `derivative` aggregation is a parent aggregation that calculates 1st order and 2nd order derivates of each bucket of a previous aggregation. +The `derivative` aggregation is a parent aggregation that calculates 1st order and 2nd order derivatives of each bucket of a previous aggregation. -In mathematics, the derivative of a function measures its sensitivity to change. In other words, a derivative evaluates the rate of change in some function with respect to some variable. To learn more about derivates, see [Wikipedia](https://en.wikipedia.org/wiki/Derivative). +In mathematics, the derivative of a function measures its sensitivity to change. In other words, a derivative evaluates the rate of change in some function with respect to some variable. To learn more about derivatives, see [Wikipedia](https://en.wikipedia.org/wiki/Derivative). -You can use derivates to calculate the rate of change of numeric values compared to its previous time periods. 
+You can use derivatives to calculate the rate of change of numeric values compared to its previous time periods. The 1st order derivative indicates whether a metric is increasing or decreasing, and by how much it's increasing or decreasing. @@ -687,7 +689,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json ... @@ -767,7 +769,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json ... @@ -866,7 +868,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json ... @@ -942,7 +944,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json "aggregations" : { @@ -1060,7 +1062,7 @@ GET opensearch_dashboards_sample_data_logs/_search ``` -#### Sample response +#### Example response ```json ... @@ -1137,7 +1139,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json ... @@ -1220,7 +1222,7 @@ GET opensearch_dashboards_sample_data_logs/_search } ``` -#### Sample response +#### Example response ```json ... diff --git a/_analyzers/index-analyzers.md b/_analyzers/index-analyzers.md new file mode 100644 index 00000000..72332758 --- /dev/null +++ b/_analyzers/index-analyzers.md @@ -0,0 +1,65 @@ +--- +layout: default +title: Index analyzers +nav_order: 20 +--- + +# Index analyzers + +Index analyzers are specified at indexing time and are used to analyze [text]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) fields when indexing a document. + +## Determining which index analyzer to use + +To determine which analyzer to use for a field when a document is indexed, OpenSearch examines the following parameters in order: + +1. The `analyzer` mapping parameter of the field +1. The `analysis.analyzer.default` index setting +1. The `standard` analyzer (default) + +When specifying an index analyzer, keep in mind that in most cases, specifying an analyzer for each `text` field in an index works best. Analyzing both the text field (at indexing time) and the query string (at query time) with the same analyzer ensures that the search uses the same terms as those that are stored in the index. +{: .important } + +For information about verifying which analyzer is associated with which field, see [Verifying analyzer settings]({{site.url}}{{site.baseurl}}/analyzers/index/#verifying-analyzer-settings). + +## Specifying an index analyzer for a field + +When creating index mappings, you can supply the `analyzer` parameter for each [text]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) field. For example, the following request specifies the `simple` analyzer for the `text_entry` field: + +```json +PUT testindex +{ + "mappings": { + "properties": { + "text_entry": { + "type": "text", + "analyzer": "simple" + } + } + } +} +``` +{% include copy-curl.html %} + +## Specifying a default index analyzer for an index + +If you want to use the same analyzer for all text fields in an index, you can specify it in the `analysis.analyzer.default` setting as follows: + +```json +PUT testindex +{ + "settings": { + "analysis": { + "analyzer": { + "default": { + "type": "simple" + } + } + } + } +} +``` +{% include copy-curl.html %} + +If you don't specify a default analyzer, the `standard` analyzer is used. 
+{: .note} + diff --git a/_analyzers/index.md b/_analyzers/index.md new file mode 100644 index 00000000..95f97ec8 --- /dev/null +++ b/_analyzers/index.md @@ -0,0 +1,175 @@ +--- +layout: default +title: Text analysis +has_children: true +nav_order: 5 +nav_exclude: true +has_toc: false +permalink: /analyzers/ +redirect_from: + - /opensearch/query-dsl/text-analyzers/ + - /query-dsl/analyzers/text-analyzers/ + - /analyzers/text-analyzers/ + - /analyzers/index/ +--- + +# Text analysis + +When you are searching documents using a full-text search, you want to receive all relevant results. If you're looking for "walk", you're interested in results that contain any form of the word, like "Walk", "walked", or "walking". To facilitate full-text search, OpenSearch uses text analysis. + +The objective of text analysis is to split the unstructured free text content of the source document into a sequence of terms, which are then stored in an inverted index. Subsequently, when a similar text analysis is applied to a user's query, the resulting sequence of terms facilitates the matching of relevant source documents. + +From a technical point of view, the text analysis process consists of several steps, some of which are optional: + +1. Before the free text content can be split into individual words, it may be beneficial to refine the text at the character level. The primary aim of this optional step is to help the tokenizer (the subsequent stage in the analysis process) generate better tokens. This can include removal of markup tags (such as HTML) or handling specific character patterns (like replacing the 🙂 emoji with the text `:slightly_smiling_face:`). + +2. The next step is to split the free text into individual words---_tokens_. This is performed by a _tokenizer_. For example, after tokenization, the sentence `Actions speak louder than words` is split into tokens `Actions`, `speak`, `louder`, `than`, and `words`. + +3. The last step is to process individual tokens by applying a series of token filters. The aim is to convert each token into a predictable form that is directly stored in the index, for example, by converting them to lowercase or performing stemming (reducing the word to its root). For example, the token `Actions` becomes `action`, `louder` becomes `loud`, and `words` becomes `word`. + +Although the terms ***token*** and ***term*** may sound similar and are occasionally used interchangeably, it is helpful to understand the difference between the two. In the context of Apache Lucene, each holds a distinct role. A ***token*** is created by a tokenizer during text analysis and often undergoes a number of additional modifications as it passes through the chain of token filters. Each token is associated with metadata that can be further used during the text analysis process. A ***term*** is a data value that is directly stored in the inverted index and is associated with much less metadata. During search, matching operates at the term level. +{: .note} + +## Analyzers + +In OpenSearch, the abstraction that encompasses text analysis is referred to as an _analyzer_. Each analyzer contains the following sequentially applied components: + +1. **Character filters**: First, a character filter receives the original text as a stream of characters and adds, removes, or modifies characters in the text. For example, a character filter can strip HTML characters from a string so that the text `
<p> Actions speak louder than words </p>
` becomes `\nActions speak louder than words\n`. The output of a character filter is a stream of characters. + +1. **Tokenizer**: Next, a tokenizer receives the stream of characters that has been processed by the character filter and splits the text into individual _tokens_ (usually, words). For example, a tokenizer can split text on white space so that the preceding text becomes [`Actions`, `speak`, `louder`, `than`, `words`]. Tokenizers also maintain metadata about tokens, such as their starting and ending positions in the text. The output of a tokenizer is a stream of tokens. + +1. **Token filters**: Last, a token filter receives the stream of tokens from the tokenizer and adds, removes, or modifies tokens. For example, a token filter may lowercase the tokens so that `Actions` becomes `action`, remove stopwords like `than`, or add synonyms like `talk` for the word `speak`. + +An analyzer must contain exactly one tokenizer and may contain zero or more character filters and zero or more token filters. +{: .note} + +There is also a special type of analyzer called a ***normalizer***. A normalizer is similar to an analyzer except that it does not contain a tokenizer and can only include specific types of character filters and token filters. These filters can perform only character-level operations, such as character or pattern replacement, and cannot perform operations on the token as a whole. This means that replacing a token with a synonym or stemming is not supported. See [Normalizers]({{site.url}}{{site.baseurl}}/analyzers/normalizers/) for further details. + +## Built-in analyzers + +The following table lists the built-in analyzers that OpenSearch provides. The last column of the table contains the result of applying the analyzer to the string `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`. + +Analyzer | Analysis performed | Analyzer output +:--- | :--- | :--- +**Standard** (default) | - Parses strings into tokens at word boundaries
- Removes most punctuation
- Converts tokens to lowercase | [`it’s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] +**Simple** | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Converts tokens to lowercase | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `to`, `opensearch`] +**Whitespace** | - Parses strings into tokens on white space | [`It’s`, `fun`, `to`, `contribute`, `a`,`brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`] +**Stop** | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Removes stop words
- Converts tokens to lowercase | [`s`, `fun`, `contribute`, `brand`, `new`, `pr`, `opensearch`] +**Keyword** (no-op) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`] +**Pattern** | - Parses strings into tokens using regular expressions
- Supports converting strings to lowercase
- Supports removing stop words | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] +[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] +**Fingerprint** | - Parses strings on any non-letter character
- Normalizes characters by converting them to ASCII
- Converts tokens to lowercase
- Sorts, deduplicates, and concatenates tokens into a single token
- Supports removing stop words | [`2 a brand contribute fun it's new opensearch or pr to`]
Note that the apostrophe was converted to its ASCII counterpart. + +## Custom analyzers + +If needed, you can combine tokenizers, token filters, and character filters to create a custom analyzer. + +## Text analysis at indexing time and query time + +OpenSearch performs text analysis on text fields when you index a document and when you send a search request. Depending on the time of text analysis, the analyzers used for it are classified as follows: + +- An _index analyzer_ performs analysis at indexing time: When you are indexing a [text]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) field, OpenSearch analyzes it before indexing it. For more information about ways to specify index analyzers, see [Index analyzers]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/). + +- A _search analyzer_ performs analysis at query time: OpenSearch analyzes the query string when you run a full-text query on a text field. For more information about ways to specify search analyzers, see [Search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). + +In most cases, you should use the same analyzer at both indexing and search time because the text field and the query string will be analyzed in the same way and the resulting tokens will match as expected. +{: .tip} + +### Example + +When you index a document that has a text field with the text `Actions speak louder than words`, OpenSearch analyzes the text and produces the following list of tokens: + +Text field tokens = [`action`, `speak`, `loud`, `than`, `word`] + +When you search for documents that match the query `speaking loudly`, OpenSearch analyzes the query string and produces the following list of tokens: + +Query string tokens = [`speak`, `loud`] + +Then OpenSearch compares each token in the query string against the list of text field tokens and finds that both lists contain the tokens `speak` and `loud`, so OpenSearch returns this document as part of the search results that match the query. + +## Testing an analyzer + +To test a built-in analyzer and view the list of tokens it generates when a document is indexed, you can use the [Analyze API]({{site.url}}{{site.baseurl}}/api-reference/analyze-apis/#apply-a-built-in-analyzer). + +Specify the analyzer and the text to be analyzed in the request: + +```json +GET /_analyze +{ + "analyzer" : "standard", + "text" : "Let’s contribute to OpenSearch!" +} +``` +{% include copy-curl.html %} + +The following image shows the query string. 
+ +![Query string with indices]({{site.url}}{{site.baseurl}}/images/string-indices.png) + +The response contains each token and its start and end offsets that correspond to the starting index in the original string (inclusive) and the ending index (exclusive): + +```json +{ + "tokens": [ + { + "token": "let’s", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "contribute", + "start_offset": 6, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "to", + "start_offset": 17, + "end_offset": 19, + "type": "", + "position": 2 + }, + { + "token": "opensearch", + "start_offset": 20, + "end_offset": 30, + "type": "", + "position": 3 + } + ] +} +``` + +## Verifying analyzer settings + +To verify which analyzer is associated with which field, you can use the get mapping API operation: + +```json +GET /testindex/_mapping +``` +{% include copy-curl.html %} + +The response provides information about the analyzers for each field: + +```json +{ + "testindex": { + "mappings": { + "properties": { + "text_entry": { + "type": "text", + "analyzer": "simple", + "search_analyzer": "whitespace" + } + } + } + } +} +``` + +## Next steps + +- Learn more about specifying [index analyzers]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) and [search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). \ No newline at end of file diff --git a/_analyzers/language-analyzers.md b/_analyzers/language-analyzers.md new file mode 100644 index 00000000..f5a2f18c --- /dev/null +++ b/_analyzers/language-analyzers.md @@ -0,0 +1,43 @@ +--- +layout: default +title: Language analyzers +nav_order: 10 +redirect_from: + - /query-dsl/analyzers/language-analyzers/ +--- + +# Language analyzer + +OpenSearch supports the following language values with the `analyzer` option: +`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `turkish`, and `thai`. + +To use the analyzer when you map an index, specify the value within your query. For example, to map your index with the French language analyzer, specify the `french` value for the analyzer field: + +```json + "analyzer": "french" +``` + +#### Example request + +The following query specifies the `french` language analyzer for the index `my-index`: + +```json +PUT my-index +{ + "mappings": { + "properties": { + "text": { + "type": "text", + "fields": { + "french": { + "type": "text", + "analyzer": "french" + } + } + } + } + } +} +``` + + \ No newline at end of file diff --git a/_analyzers/normalizers.md b/_analyzers/normalizers.md new file mode 100644 index 00000000..b89659f8 --- /dev/null +++ b/_analyzers/normalizers.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Normalizers +nav_order: 100 +--- + +# Normalizers + +A _normalizer_ functions similarly to an analyzer but outputs only a single token. It does not contain a tokenizer and can only include specific types of character and token filters. These filters can perform only character-level operations, such as character or pattern replacement, and cannot operate on the token as a whole. This means that replacing a token with a synonym or stemming is not supported. 
+ +A normalizer is useful in keyword search (that is, in term-based queries) because it allows you to run token and character filters on any given input. For instance, it makes it possible to match an incoming query `Naïve` with the index term `naive`. + +Consider the following example. + +Create a new index with a custom normalizer: +```json +PUT /sample-index +{ + "settings": { + "analysis": { + "normalizer": { + "normalized_keyword": { + "type": "custom", + "char_filter": [], + "filter": [ "asciifolding", "lowercase" ] + } + } + } + }, + "mappings": { + "properties": { + "approach": { + "type": "keyword", + "normalizer": "normalized_keyword" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document: +```json +POST /sample-index/_doc/ +{ + "approach": "naive" +} +``` +{% include copy-curl.html %} + +The following query matches the document. This is expected: +```json +GET /sample-index/_search +{ + "query": { + "term": { + "approach": "naive" + } + } +} +``` +{% include copy-curl.html %} + +But this query matches the document as well: +```json +GET /sample-index/_search +{ + "query": { + "term": { + "approach": "Naïve" + } + } +} +``` +{% include copy-curl.html %} + +To understand why, consider the effect of the normalizer: +```json +GET /sample-index/_analyze +{ + "normalizer" : "normalized_keyword", + "text" : "Naïve" +} +``` + +Internally, a normalizer accepts only filters that are instances of either `NormalizingTokenFilterFactory` or `NormalizingCharFilterFactory`. The following is a list of compatible filters found in modules and plugins that are part of the core OpenSearch repository. + +### The `common-analysis` module + +This module does not require installation; it is available by default. + +Character filters: `pattern_replace`, `mapping` + +Token filters: `arabic_normalization`, `asciifolding`, `bengali_normalization`, `cjk_width`, `decimal_digit`, `elision`, `german_normalization`, `hindi_normalization`, `indic_normalization`, `lowercase`, `persian_normalization`, `scandinavian_folding`, `scandinavian_normalization`, `serbian_normalization`, `sorani_normalization`, `trim`, `uppercase` + +### The `analysis-icu` plugin + +Character filters: `icu_normalizer` + +Token filters: `icu_normalizer`, `icu_folding`, `icu_transform` + +### The `analysis-kuromoji` plugin + +Character filters: `normalize_kanji`, `normalize_kana` + +### The `analysis-nori` plugin + +Character filters: `normalize_kanji`, `normalize_kana` + +These lists of filters include only analysis components found in the [additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins) that are part of the core OpenSearch repository. +{: .note} \ No newline at end of file diff --git a/_analyzers/search-analyzers.md b/_analyzers/search-analyzers.md new file mode 100644 index 00000000..b47e739d --- /dev/null +++ b/_analyzers/search-analyzers.md @@ -0,0 +1,93 @@ +--- +layout: default +title: Search analyzers +nav_order: 30 +--- + +# Search analyzers + +Search analyzers are specified at query time and are used to analyze the query string when you run a full-text query on a [text]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) field. + +## Determining which search analyzer to use + +To determine which analyzer to use for a query string at query time, OpenSearch examines the following parameters in order: + +1. The `analyzer` parameter of the query +1. The `search_analyzer` mapping parameter of the field +1. 
The `analysis.analyzer.default_search` index setting +1. The `analyzer` mapping parameter of the field +1. The `standard` analyzer (default) + +In most cases, specifying a search analyzer that is different from the index analyzer is not necessary and could negatively impact search result relevance or lead to unexpected search results. +{: .warning} + +For information about verifying which analyzer is associated with which field, see [Verifying analyzer settings]({{site.url}}{{site.baseurl}}/analyzers/index/#verifying-analyzer-settings). + +## Specifying a search analyzer for a query string + +Specify the name of the analyzer you want to use at query time in the `analyzer` field: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": { + "query": "speak the truth", + "analyzer": "english" + } + } + } +} +``` +{% include copy-curl.html %} + +Valid values for [built-in analyzers]({{site.url}}{{site.baseurl}}/analyzers/index#built-in-analyzers) are `standard`, `simple`, `whitespace`, `stop`, `keyword`, `pattern`, `fingerprint`, or any supported [language analyzer]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/). + +## Specifying a search analyzer for a field + +When creating index mappings, you can provide the `search_analyzer` parameter for each [text]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) field. When providing the `search_analyzer`, you must also provide the `analyzer` parameter, which specifies the [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) to be used at indexing time. + +For example, the following request specifies the `simple` analyzer as the index analyzer and the `whitespace` analyzer as the search analyzer for the `text_entry` field: + +```json +PUT testindex +{ + "mappings": { + "properties": { + "text_entry": { + "type": "text", + "analyzer": "simple", + "search_analyzer": "whitespace" + } + } + } +} +``` +{% include copy-curl.html %} + +## Specifying the default search analyzer for an index + +If you want to analyze all query strings at search time with the same analyzer, you can specify the search analyzer in the `analysis.analyzer.default_search` setting. When providing the `analysis.analyzer.default_search`, you must also provide the `analysis.analyzer.default` parameter, which specifies the [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) to be used at indexing time. + +For example, the following request specifies the `simple` analyzer as the index analyzer and the `whitespace` analyzer as the search analyzer for the `testindex` index: + +```json +PUT testindex +{ + "settings": { + "analysis": { + "analyzer": { + "default": { + "type": "simple" + }, + "default_search": { + "type": "whitespace" + } + } + } + } +} + +``` +{% include copy-curl.html %} diff --git a/_analyzers/token-filters/delimited-term-frequency.md b/_analyzers/token-filters/delimited-term-frequency.md new file mode 100644 index 00000000..e2db7644 --- /dev/null +++ b/_analyzers/token-filters/delimited-term-frequency.md @@ -0,0 +1,275 @@ +--- +layout: default +title: Delimited term frequency +parent: Token filters +nav_order: 100 +--- + +# Delimited term frequency token filter + +The `delimited_term_freq` token filter separates a token stream into tokens with corresponding term frequencies, based on a provided delimiter. A token consists of all characters before the delimiter, and a term frequency is the integer after the delimiter. 
For example, if the delimiter is `|`, then for the string `foo|5`, `foo` is the token and `5` is its term frequency. If there is no delimiter, the token filter does not modify the term frequency. + +You can either use a preconfigured `delimited_term_freq` token filter or create a custom one. + +## Preconfigured `delimited_term_freq` token filter + +The preconfigured `delimited_term_freq` token filter uses the `|` default delimiter. To analyze text with the preconfigured token filter, send the following request to the `_analyze` endpoint: + +```json +POST /_analyze +{ + "text": "foo|100", + "tokenizer": "keyword", + "filter": ["delimited_term_freq"], + "attributes": ["termFrequency"], + "explain": true +} +``` +{% include copy-curl.html %} + +The `attributes` array specifies that you want to filter the output of the `explain` parameter to return only `termFrequency`. The response contains both the original token and the parsed output of the token filter that includes the term frequency: + +```json +{ + "detail": { + "custom_analyzer": true, + "charfilters": [], + "tokenizer": { + "name": "keyword", + "tokens": [ + { + "token": "foo|100", + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0, + "termFrequency": 1 + } + ] + }, + "tokenfilters": [ + { + "name": "delimited_term_freq", + "tokens": [ + { + "token": "foo", + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0, + "termFrequency": 100 + } + ] + } + ] + } +} +``` + +## Custom `delimited_term_freq` token filter + +To configure a custom `delimited_term_freq` token filter, first specify the delimiter in the mapping request, in this example, `^`: + +```json +PUT /testindex +{ + "settings": { + "analysis": { + "filter": { + "my_delimited_term_freq": { + "type": "delimited_term_freq", + "delimiter": "^" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Then analyze text with the custom token filter you created: + +```json +POST /testindex/_analyze +{ + "text": "foo^3", + "tokenizer": "keyword", + "filter": ["my_delimited_term_freq"], + "attributes": ["termFrequency"], + "explain": true +} +``` +{% include copy-curl.html %} + +The response contains both the original token and the parsed version with the term frequency: + +```json +{ + "detail": { + "custom_analyzer": true, + "charfilters": [], + "tokenizer": { + "name": "keyword", + "tokens": [ + { + "token": "foo|100", + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0, + "termFrequency": 1 + } + ] + }, + "tokenfilters": [ + { + "name": "delimited_term_freq", + "tokens": [ + { + "token": "foo", + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0, + "termFrequency": 100 + } + ] + } + ] + } +} +``` + +## Combining `delimited_token_filter` with scripts + +You can write Painless scripts to calculate custom scores for the documents in the results. 
+ +First, create an index and provide the following mappings and settings: + +```json +PUT /test +{ + "settings": { + "number_of_shards": 1, + "analysis": { + "tokenizer": { + "keyword_tokenizer": { + "type": "keyword" + } + }, + "filter": { + "my_delimited_term_freq": { + "type": "delimited_term_freq", + "delimiter": "^" + } + }, + "analyzer": { + "custom_delimited_analyzer": { + "tokenizer": "keyword_tokenizer", + "filter": ["my_delimited_term_freq"] + } + } + } + }, + "mappings": { + "properties": { + "f1": { + "type": "keyword" + }, + "f2": { + "type": "text", + "analyzer": "custom_delimited_analyzer", + "index_options": "freqs" + } + } + } +} +``` +{% include copy-curl.html %} + +The `test` index uses a keyword tokenizer, a delimited term frequency token filter (where the delimiter is `^`), and a custom analyzer that includes a keyword tokenizer and a delimited term frequency token filter. The mappings specify that the field `f1` is a keyword field and the field `f2` is a text field. The field `f2` uses the custom analyzer defined in the settings for text analysis. Additionally, specifying `index_options` signals to OpenSearch to add the term frequencies to the inverted index. You'll use the term frequencies to give documents with repeated terms a higher score. + +Next, index two documents using bulk upload: + +```json +POST /_bulk?refresh=true +{"index": {"_index": "test", "_id": "doc1"}} +{"f1": "v0|100", "f2": "v1^30"} +{"index": {"_index": "test", "_id": "doc2"}} +{"f2": "v2|100"} +``` +{% include copy-curl.html %} + +The following query searches for all documents in the index and calculates document scores as the term frequency of the term `v1` in the field `f2`: + +```json +GET /test/_search +{ + "query": { + "function_score": { + "query": { + "match_all": {} + }, + "script_score": { + "script": { + "source": "termFreq(params.field, params.term)", + "params": { + "field": "f2", + "term": "v1" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +In the response, document 1 has a score of 30 because the term frequency of the term `v1` in the field `f2` is 30. Document 2 has a score of 0 because the term `v1` does not appear in `f2`: + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 30, + "hits": [ + { + "_index": "test", + "_id": "doc1", + "_score": 30, + "_source": { + "f1": "v0|100", + "f2": "v1^30" + } + }, + { + "_index": "test", + "_id": "doc2", + "_score": 0, + "_source": { + "f2": "v2|100" + } + } + ] + } +} +``` + +## Parameters + +The following table lists all parameters that the `delimited_term_freq` supports. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`delimiter` | Optional | The delimiter used to separate tokens from term frequencies. Must be a single non-null character. Default is `|`. \ No newline at end of file diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md new file mode 100644 index 00000000..ba09a7fa --- /dev/null +++ b/_analyzers/token-filters/index.md @@ -0,0 +1,64 @@ +--- +layout: default +title: Token filters +nav_order: 70 +has_children: true +has_toc: false +--- + +# Token filters + +Token filters receive the stream of tokens from the tokenizer and add, remove, or modify the tokens. 
For example, a token filter may lowercase the tokens so that `Actions` becomes `action`, remove stopwords like `than`, or add synonyms like `talk` for the word `speak`. + +The following table lists all token filters that OpenSearch supports. + +Token filter | Underlying Lucene token filter| Description +`apostrophe` | [ApostropheFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/tr/ApostropheFilter.html) | In each token that contains an apostrophe, the `apostrophe` token filter removes the apostrophe itself and all characters following the apostrophe. +`asciifolding` | [ASCIIFoldingFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html) | Converts alphabetic, numeric, and symbolic characters. +`cjk_bigram` | [CJKBigramFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. +`cjk_width` | [CJKWidthFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules:
- Folds full-width ASCII character variants into the equivalent basic Latin characters.
- Folds half-width Katakana character variants into the equivalent Kana characters. +`classic` | [ClassicFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/standard/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. +`common_grams` | [CommonGramsFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. +`conditional` | [ConditionalTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. +`decimal_digit` | [DecimalDigitFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). +`delimited_payload` | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters before the delimiter, and a payload consists of all characters after the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. +[`delimited_term_freq`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-term-frequency/) | [DelimitedTermFrequencyTokenFilter](https://lucene.apache.org/core/9_7_0/analysis/common/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.html) | Separates a token stream into tokens with corresponding term frequencies, based on a provided delimiter. A token consists of all characters before the delimiter, and a term frequency is the integer after the delimiter. For example, if the delimiter is `|`, then for the string `foo|5`, `foo` is the token and `5` is the term frequency. +`dictionary_decompounder` | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. +`edge_ngram` | [EdgeNGramTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. +`elision` | [ElisionFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). +`fingerprint` | [FingerprintFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. 
+`flatten_graph` | [FlattenGraphFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. +`hunspell` | [HunspellStemFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell supports a word having multiple stems, this filter can emit multiple tokens for each consumed token. Requires you to configure one or more language-specific Hunspell dictionaries. +`hyphenation_decompounder` | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. +`keep_types` | [TypeTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. +`keep_word` | [KeepWordFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. +`keyword_marker` | [KeywordMarkerFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. +`keyword_repeat` | [KeywordRepeatFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. +`kstem` | [KStemFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides kstem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. +`length` | [LengthFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens whose lengths are shorter or longer than the length range specified by `min` and `max`. +`limit` | [LimitTokenCountFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. A common use case is to limit the size of document field values based on token count. +`lowercase` | [LowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html) is for the English language. 
You can set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). +`min_hash` | [MinHashFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. +`multiplexer` | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. +`ngram` | [NGramTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. +Normalization | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization` : [ScandinavianNormalizationFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. +`pattern_capture` | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +`pattern_replace` | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +`phonetic` | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. +`porter_stem` | [PorterStemFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. +`predicate_token_filter` | N/A | Removes tokens that don’t match the specified predicate script. Supports inline Painless scripts only. +`remove_duplicates` | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. +`reverse` | [ReverseStringFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. +`shingle` | [ShingleFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but apply to words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. +`snowball` | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). You can use the `snowball` token filter with the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. 
+`stemmer` | N/A | Provides algorithmic stemming for the following languages in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `Lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`. +`stemmer_override` | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed. +`stop` | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream. +`synonym` | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file. +`synonym_graph` | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process. +`trim` | [TrimFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space from each token in a stream. +`truncate` | [TruncateTokenFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens whose length exceeds the specified character limit. +`unique` | N/A | Ensures each token is unique by removing duplicate tokens from a stream. +`uppercase` | [UpperCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to uppercase. +`word_delimiter` | [WordDelimiterFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. +`word_delimiter_graph` | [WordDelimiterGraphFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns multi-position tokens a `positionLength` attribute. diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md new file mode 100644 index 00000000..d401851f --- /dev/null +++ b/_analyzers/tokenizers/index.md @@ -0,0 +1,61 @@ +--- +layout: default +title: Tokenizers +nav_order: 60 +has_children: false +has_toc: false +--- + +# Tokenizers + +A tokenizer receives a stream of characters and splits the text into individual _tokens_. A token consists of a term (usually, a word) and metadata about this term. For example, a tokenizer can split text on white space so that the text `Actions speak louder than words.` becomes [`Actions`, `speak`, `louder`, `than`, `words.`]. + +The output of a tokenizer is a stream of tokens. 
Tokenizers also maintain the following metadata about tokens: + +- The **order** or **position** of each token: This information is used for word and phrase proximity queries. +- The starting and ending positions (**offsets**) of the tokens in the text: This information is used for highlighting search terms. +- The token **type**: Some tokenizers (for example, `standard`) classify tokens by type, for example, `<ALPHANUM>` or `<NUM>`. Simpler tokenizers (for example, `letter`) only classify tokens as type `word`. + +You can use tokenizers to define custom analyzers. + +## Built-in tokenizers + +The following tables list the built-in tokenizers that OpenSearch provides. + +### Word tokenizers + +Word tokenizers parse full text into words. + +Tokenizer | Description | Example +:--- | :--- | :--- +`standard` | - Parses strings into tokens at word boundaries
- Removes most punctuation | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `PR`, `or`, `2`, `to`, `OpenSearch`] +`letter` | - Parses strings into tokens on any non-letter character
- Removes non-letter characters | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It`, `s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `PR`, `or`, `to`, `OpenSearch`] +`lowercase` | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Converts terms to lowercase | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`it`, `s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `pr`, `or`, `to`, `opensearch`] +`whitespace` | - Parses strings into tokens at white space characters | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`, `brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`] +`uax_url_email` | - Similar to the standard tokenizer
- Unlike the standard tokenizer, leaves URLs and email addresses as single terms | `It’s fun to contribute a brand-new PR or 2 to OpenSearch opensearch-project@github.com!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `PR`, `or`, `2`, `to`, `OpenSearch`, `opensearch-project@github.com`] +`classic` | - Parses strings into tokens on:
  - Punctuation characters that are followed by a white space character
  - Hyphens if the term does not contain numbers
- Removes punctuation
- Leaves URLs and email addresses as single terms | `Part number PA-35234, single-use product (128.32)`
becomes
[`Part`, `number`, `PA-35234`, `single`, `use`, `product`, `128.32`] +`thai` | - Parses Thai text into terms | `สวัสดีและยินดีต`
becomes
[`สวัสด`, `และ`, `ยินดี`, `ต`] + +### Partial word tokenizers + +Partial word tokenizers parse text into words and generate fragments of those words for partial word matching. + +Tokenizer | Description | Example +:--- | :--- | :--- +`ngram`| - Parses strings into words on specified characters (for example, punctuation or white space characters) and generates n-grams of each word | `My repo`
becomes
[`M`, `My`, `y`, `y `, ` `, ` r`, `r`, `re`, `e`, `ep`, `p`, `po`, `o`]
because the default n-gram length is 1--2 characters +`edge_ngram` | - Parses strings into words on specified characters (for example, punctuation or white space characters) and generates edge n-grams of each word (n-grams that start at the beginning of the word) | `My repo`
becomes
[`M`, `My`]
because the default n-gram length is 1--2 characters + +### Structured text tokenizers + +Structured text tokenizers parse structured text, such as identifiers, email addresses, paths, or ZIP Codes. + +Tokenizer | Description | Example +:--- | :--- | :--- +`keyword` | - No-op tokenizer
- Outputs the entire string unchanged
- Can be combined with token filters, like lowercase, to normalize terms | `My repo`
becomes
`My repo` +`pattern` | - Uses a regular expression pattern to parse text into terms on a word separator or to capture matching text as terms
- Uses [Java regular expressions](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) | `https://opensearch.org/forum`
becomes
[`https`, `opensearch`, `org`, `forum`] because by default the tokenizer splits terms at word boundaries (`\W+`)
Can be configured with a regex pattern +`simple_pattern` | - Uses a regular expression pattern to return matching text as terms
- Uses [Lucene regular expressions](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/util/automaton/RegExp.html)
- Faster than the `pattern` tokenizer because it uses a subset of the `pattern` tokenizer regular expressions | Returns an empty array by default
Must be configured with a pattern because the pattern defaults to an empty string +`simple_pattern_split` | - Uses a regular expression pattern to split the text at matches rather than returning the matches as terms
- Uses [Lucene regular expressions](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/util/automaton/RegExp.html)
- Faster than the `pattern` tokenizer because it uses a subset of the `pattern` tokenizer regular expressions | No-op by default
Must be configured with a pattern +`char_group` | - Parses on a set of configurable characters
- Faster than tokenizers that run regular expressions | No-op by default
Must be configured with a list of characters +`path_hierarchy` | - Parses text on the path separator (by default, `/`) and returns a full path to each component in the tree hierarchy | `one/two/three`
becomes
[`one`, `one/two`, `one/two/three`] + + diff --git a/_api-reference/analyze-apis.md b/_api-reference/analyze-apis.md new file mode 100644 index 00000000..7c61a6fd --- /dev/null +++ b/_api-reference/analyze-apis.md @@ -0,0 +1,690 @@ +--- +layout: default +title: Analyze API +has_children: true +nav_order: 7 +redirect_from: + - /opensearch/rest-api/analyze-apis/ + - /api-reference/analyze-apis/ +--- + +# Analyze API +**Introduced 1.0** +{: .label .label-purple } + +The Analyze API allows you to perform [text analysis]({{site.url}}{{site.baseurl}}/api-reference/analyze-apis/), which is the process of converting unstructured text into individual tokens (usually words) that are optimized for search. + +The Analyze API analyzes a text string and returns the resulting tokens. + +If you use the Security plugin, you must have the `manage index` privilege. If you only want to analyze text, you must have the `manage cluster` privilege. +{: .note} + +## Path and HTTP methods + +``` +GET /_analyze +GET /{index}/_analyze +POST /_analyze +POST /{index}/_analyze +``` + +Although you can issue an analyze request using both `GET` and `POST` requests, the two have important distinctions. A `GET` request causes data to be cached in the index so that the next time the data is requested, it is retrieved faster. A `POST` request sends a string that does not already exist to the analyzer to be compared with data that is already in the index. `POST` requests are not cached. +{: .note} + +## Path parameter + +You can include the following optional path parameter in your request. + +Parameter | Data type | Description +:--- | :--- | :--- +index | String | Index that is used to derive the analyzer. + +## Query parameters + +You can include the following optional query parameters in your request. + +Field | Data type | Description +:--- | :--- | :--- +analyzer | String | The name of the analyzer to apply to the `text` field. The analyzer can be built in or configured in the index.

If `analyzer` is not specified, the analyze API uses the analyzer defined in the mapping of the `field` field.

If the `field` field is not specified, the analyze API uses the default analyzer for the index.

If no index is specified or the index does not have a default analyzer, the analyze API uses the standard analyzer. +attributes | Array of Strings | Array of token attributes for filtering the output of the `explain` field. +char_filter | Array of Strings | Array of character filters for preprocessing characters before the `tokenizer` field. +explain | Boolean | If true, causes the response to include token attributes and additional details. Defaults to `false`. +field | String | Field for deriving the analyzer.

If you specify `field`, you must also specify the `index` path parameter.

If you specify the `analyzer` field, it overrides the value of `field`.

If you do not specify `field`, the analyze API uses the default analyzer for the index.

If you do not specify the `index` field, or the index does not have a default analyzer, the analyze API uses the standard analyzer. +filter | Array of Strings | Array of token filters to apply after the `tokenizer` field. +normalizer | String | Normalizer for converting text into a single token. +tokenizer | String | Tokenizer for converting the `text` field into tokens. + +The following query parameter is required. + +Field | Data type | Description +:--- | :--- | :--- +text | String or Array of Strings | Text to analyze. If you provide an array of strings, the text is analyzed as a multi-value field. + +#### Example requests + +[Analyze array of text strings](#analyze-array-of-text-strings) + +[Apply a built-in analyzer](#apply-a-built-in-analyzer) + +[Apply a custom analyzer](#apply-a-custom-analyzer) + +[Apply a custom transient analyzer](#apply-a-custom-transient-analyzer) + +[Specify an index](#specify-an-index) + +[Derive the analyzer from an index field](#derive-the-analyzer-from-an-index-field) + +[Specify a normalizer](#specify-a-normalizer) + +[Get token details](#get-token-details) + +[Set a token limit](#set-a-token-limit) + +#### Analyze array of text strings + +When you pass an array of strings to the `text` field, it is analyzed as a multi-value field. + +````json +GET /_analyze +{ + "analyzer" : "standard", + "text" : ["first array element", "second array element"] +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "tokens" : [ + { + "token" : "first", + "start_offset" : 0, + "end_offset" : 5, + "type" : "", + "position" : 0 + }, + { + "token" : "array", + "start_offset" : 6, + "end_offset" : 11, + "type" : "", + "position" : 1 + }, + { + "token" : "element", + "start_offset" : 12, + "end_offset" : 19, + "type" : "", + "position" : 2 + }, + { + "token" : "second", + "start_offset" : 20, + "end_offset" : 26, + "type" : "", + "position" : 3 + }, + { + "token" : "array", + "start_offset" : 27, + "end_offset" : 32, + "type" : "", + "position" : 4 + }, + { + "token" : "element", + "start_offset" : 33, + "end_offset" : 40, + "type" : "", + "position" : 5 + } + ] +} +```` + +#### Apply a built-in analyzer + +If you omit the `index` path parameter, you can apply any of the built-in analyzers to the text string. + +The following request analyzes text using the `standard` built-in analyzer: + +````json +GET /_analyze +{ + "analyzer" : "standard", + "text" : "OpenSearch text analysis" +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "tokens" : [ + { + "token" : "opensearch", + "start_offset" : 0, + "end_offset" : 10, + "type" : "", + "position" : 0 + }, + { + "token" : "text", + "start_offset" : 11, + "end_offset" : 15, + "type" : "", + "position" : 1 + }, + { + "token" : "analysis", + "start_offset" : 16, + "end_offset" : 24, + "type" : "", + "position" : 2 + } + ] +} +```` + +#### Apply a custom analyzer + +You can create your own analyzer and specify it in an analyze request. + +In this scenario, a custom analyzer `lowercase_ascii_folding` has been created and associated with the `books2` index. The analyzer converts text to lowercase and converts non-ASCII characters to ASCII. + +The following request applies the custom analyzer to the provided text: + +````json +GET /books2/_analyze +{ + "analyzer": "lowercase_ascii_folding", + "text" : "Le garçon m'a SUIVI." 
+} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "tokens" : [ + { + "token" : "le", + "start_offset" : 0, + "end_offset" : 2, + "type" : "", + "position" : 0 + }, + { + "token" : "garcon", + "start_offset" : 3, + "end_offset" : 9, + "type" : "", + "position" : 1 + }, + { + "token" : "m'a", + "start_offset" : 10, + "end_offset" : 13, + "type" : "", + "position" : 2 + }, + { + "token" : "suivi", + "start_offset" : 14, + "end_offset" : 19, + "type" : "", + "position" : 3 + } + ] +} +```` + +#### Apply a custom transient analyzer + +You can build a custom transient analyzer from tokenizers, token filters, or character filters. Use the `filter` parameter to specify token filters. + +The following request uses the `uppercase` character filter to convert the text to uppercase: + +````json +GET /_analyze +{ + "tokenizer" : "keyword", + "filter" : ["uppercase"], + "text" : "OpenSearch filter" +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "tokens" : [ + { + "token" : "OPENSEARCH FILTER", + "start_offset" : 0, + "end_offset" : 17, + "type" : "word", + "position" : 0 + } + ] +} +```` +
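The `tokenizer` parameter also accepts an inline definition instead of a built-in tokenizer name. The following request is a minimal sketch that defines an `edge_ngram` tokenizer inline (the gram lengths shown are only illustrative) and combines it with the `lowercase` token filter:

````json
GET /_analyze
{
  "tokenizer" : { "type": "edge_ngram", "min_gram": 2, "max_gram": 4 },
  "filter" : ["lowercase"],
  "text" : "OpenSearch"
}
````
{% include copy-curl.html %}

With these settings, the request should return the tokens `op`, `ope`, and `open`.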
+ +The following request uses the `html_strip` filter to remove HTML characters from the text: + +````json +GET /_analyze +{ + "tokenizer" : "keyword", + "filter" : ["lowercase"], + "char_filter" : ["html_strip"], + "text" : "<b>Leave</b> right now!" +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +```` json +{ + "tokens" : [ + { + "token" : "leave right now!", + "start_offset" : 3, + "end_offset" : 23, + "type" : "word", + "position" : 0 + } + ] +} +````
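Character filters can likewise be defined inline. As a rough sketch (the mapping rule here is only illustrative), the following request uses an inline `mapping` character filter to rewrite an emoticon before the `whitespace` tokenizer splits the text:

````json
GET /_analyze
{
  "tokenizer" : "whitespace",
  "char_filter" : [ { "type": "mapping", "mappings": [ ":) => happy" ] } ],
  "text" : "analysis makes me :)"
}
````
{% include copy-curl.html %}

The response should contain the tokens `analysis`, `makes`, `me`, and `happy`.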
+ +You can combine filters using an array. + +The following request combines a `lowercase` translation with a `stop` filter that removes the words in the `stopwords` array: + +````json +GET /_analyze +{ + "tokenizer" : "whitespace", + "filter" : ["lowercase", {"type": "stop", "stopwords": [ "to", "in"]}], + "text" : "how to train your dog in five steps" +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "tokens" : [ + { + "token" : "how", + "start_offset" : 0, + "end_offset" : 3, + "type" : "word", + "position" : 0 + }, + { + "token" : "train", + "start_offset" : 7, + "end_offset" : 12, + "type" : "word", + "position" : 2 + }, + { + "token" : "your", + "start_offset" : 13, + "end_offset" : 17, + "type" : "word", + "position" : 3 + }, + { + "token" : "dog", + "start_offset" : 18, + "end_offset" : 21, + "type" : "word", + "position" : 4 + }, + { + "token" : "five", + "start_offset" : 25, + "end_offset" : 29, + "type" : "word", + "position" : 6 + }, + { + "token" : "steps", + "start_offset" : 30, + "end_offset" : 35, + "type" : "word", + "position" : 7 + } + ] +} +```` + +#### Specify an index + +You can analyze text using an index's default analyzer, or you can specify a different analyzer. + +The following request analyzes the provided text using the default analyzer associated with the `books` index: + +````json +GET /books/_analyze +{ + "text" : "OpenSearch analyze test" +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json + + "tokens" : [ + { + "token" : "opensearch", + "start_offset" : 0, + "end_offset" : 10, + "type" : "", + "position" : 0 + }, + { + "token" : "analyze", + "start_offset" : 11, + "end_offset" : 18, + "type" : "", + "position" : 1 + }, + { + "token" : "test", + "start_offset" : 19, + "end_offset" : 23, + "type" : "", + "position" : 2 + } + ] +} +```` + +
+ +The following request analyzes the provided text using the `keyword` analyzer, which returns the entire text value as a single token: + +````json +GET /books/_analyze +{ + "analyzer" : "keyword", + "text" : "OpenSearch analyze test" +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "tokens" : [ + { + "token" : "OpenSearch analyze test", + "start_offset" : 0, + "end_offset" : 23, + "type" : "word", + "position" : 0 + } + ] +} +```` + +#### Derive the analyzer from an index field + +You can pass text and a field in the index. The API looks up the field's analyzer and uses it to analyze the text. + +If the mapping does not exist, the API uses the standard analyzer, which converts all text to lowercase and tokenizes based on white space. + +The following request causes the analysis to be based on the mapping for `name`: + +````json +GET /books2/_analyze +{ + "field" : "name", + "text" : "OpenSearch analyze test" +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "tokens" : [ + { + "token" : "opensearch", + "start_offset" : 0, + "end_offset" : 10, + "type" : "", + "position" : 0 + }, + { + "token" : "analyze", + "start_offset" : 11, + "end_offset" : 18, + "type" : "", + "position" : 1 + }, + { + "token" : "test", + "start_offset" : 19, + "end_offset" : 23, + "type" : "", + "position" : 2 + } + ] +} +```` + +#### Specify a normalizer + +Instead of using a keyword field, you can use the normalizer associated with the index. A normalizer causes the analysis change to produce a single token. + +In this example, the `books2` index includes a normalizer called `to_lower_fold_ascii` that converts text to lowercase and translates non-ASCII text to ASCII. + +The following request applies `to_lower_fold_ascii` to the text: + +````json +GET /books2/_analyze +{ + "normalizer" : "to_lower_fold_ascii", + "text" : "C'est le garçon qui m'a suivi." +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "tokens" : [ + { + "token" : "c'est le garcon qui m'a suivi.", + "start_offset" : 0, + "end_offset" : 30, + "type" : "word", + "position" : 0 + } + ] +} +```` + +
+ +You can create a custom transient normalizer with token and character filters. + +The following request uses the `uppercase` character filter to convert the given text to all uppercase: + +````json +GET /_analyze +{ + "filter" : ["uppercase"], + "text" : "That is the boy who followed me." +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "tokens" : [ + { + "token" : "THAT IS THE BOY WHO FOLLOWED ME.", + "start_offset" : 0, + "end_offset" : 32, + "type" : "word", + "position" : 0 + } + ] +} +```` + +#### Get token details + +You can obtain additional details for all tokens by setting the `explain` attribute to `true`. + +The following request provides detailed token information for the `reverse` filter used with the `standard` tokenizer: + +````json +GET /_analyze +{ + "tokenizer" : "standard", + "filter" : ["reverse"], + "text" : "OpenSearch analyze test", + "explain" : true, + "attributes" : ["keyword"] +} +```` +{% include copy-curl.html %} + +The previous request returns the following fields: + +````json +{ + "detail" : { + "custom_analyzer" : true, + "charfilters" : [ ], + "tokenizer" : { + "name" : "standard", + "tokens" : [ + { + "token" : "OpenSearch", + "start_offset" : 0, + "end_offset" : 10, + "type" : "", + "position" : 0 + }, + { + "token" : "analyze", + "start_offset" : 11, + "end_offset" : 18, + "type" : "", + "position" : 1 + }, + { + "token" : "test", + "start_offset" : 19, + "end_offset" : 23, + "type" : "", + "position" : 2 + } + ] + }, + "tokenfilters" : [ + { + "name" : "reverse", + "tokens" : [ + { + "token" : "hcraeSnepO", + "start_offset" : 0, + "end_offset" : 10, + "type" : "", + "position" : 0 + }, + { + "token" : "ezylana", + "start_offset" : 11, + "end_offset" : 18, + "type" : "", + "position" : 1 + }, + { + "token" : "tset", + "start_offset" : 19, + "end_offset" : 23, + "type" : "", + "position" : 2 + } + ] + } + ] + } +} +```` + +#### Set a token limit + +You can set a limit to the number of tokens generated. Setting a lower value reduces a node's memory usage. The default value is 10000. + +The following request limits the tokens to four: + +````json +PUT /books2 +{ + "settings" : { + "index.analyze.max_token_count" : 4 + } +} +```` +{% include copy-curl.html %} + +The preceding request is an index API rather than an analyze API. See [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings) for additional details. +{: .note} + +### Response fields + +The text analysis endpoints return the following response fields. + +Field | Data type | Description +:--- | :--- | :--- +tokens | Array | Array of tokens derived from the `text`. See [token object](#token-object). +detail | Object | Details about the analysis and each token. Included only when you request token details. See [detail object](#detail-object). + +#### Token object + +Field | Data type | Description +:--- | :--- | :--- +token | String | The token's text. +start_offset | Integer | The token's starting position within the original text string. Offsets are zero-based. +end_offset | Integer | The token's ending position within the original text string. +type | String | Classification of the token: ``, ``, and so on. The tokenizer usually sets the type, but some filters define their own types. For example, the synonym filter defines the `` type. +position | Integer | The token's position within the `tokens` array. 
+ +#### Detail object + +Field | Data type | Description +:--- | :--- | :--- +custom_analyzer | Boolean | Whether the analyzer applied to the text is custom or built in. +charfilters | Array | List of character filters applied to the text. +tokenizer | Object | Name of the tokenizer applied to the text and a list of tokens* with content before the token filters were applied. +tokenfilters | Array | List of token filters applied to the text. Each token filter includes the filter's name and a list of tokens* with content after the filters were applied. Token filters are listed in the order they are specified in the request. + +See [token object](#token-object) for token field descriptions. +{: .note} \ No newline at end of file diff --git a/_api-reference/analyze-apis/terminology.md b/_api-reference/analyze-apis/terminology.md new file mode 100644 index 00000000..17d26308 --- /dev/null +++ b/_api-reference/analyze-apis/terminology.md @@ -0,0 +1,37 @@ +--- +layout: default +title: Analysis API Terminology +parent: Analyze API + +nav_order: 1 +--- + +# Terminology + +The following sections provide descriptions of important text analysis terms. + +## Analyzers + +Analyzers tell OpenSearch how to index and search text. An analyzer is composed of three components: a tokenizer, zero or more token filters, and zero or more character filters. + +OpenSearch provides *built-in* analyzers. For example, the `standard` built-in analyzer converts text to lowercase and breaks text into tokens based on word boundaries such as carriage returns and white space. The `standard` analyzer is also called the *default* analyzer and is used when no analyzer is specified in the text analysis request. + +If needed, you can combine tokenizers, token filters, and character filters to create a *custom* analyzer. + +#### Tokenizers + +Tokenizers break unstructured text into tokens and maintain metadata about tokens, such as their starting and ending positions in the text. + +#### Character filters + +Character filters examine text and perform translations, such as changing, removing, and adding characters. + +#### Token filters + +Token filters modify tokens, performing operations such as converting a token's characters to uppercase and adding or removing tokens. + +## Normalizers + +Similar to analyzers, normalizers tokenize text but return a single token only. Normalizers do not employ tokenizers; they make limited use of character and token filters, such as those that operate on one character at a time. + +By default, OpenSearch does not apply normalizers. To apply normalizers, you must add them to your data before creating an index. \ No newline at end of file diff --git a/_opensearch/rest-api/cat/cat-aliases.md b/_api-reference/cat/cat-aliases.md similarity index 69% rename from _opensearch/rest-api/cat/cat-aliases.md rename to _api-reference/cat/cat-aliases.md index 500a0ce0..9e4407dc 100644 --- a/_opensearch/rest-api/cat/cat-aliases.md +++ b/_api-reference/cat/cat-aliases.md @@ -1,35 +1,40 @@ --- layout: default -title: cat aliases -parent: CAT -grand_parent: REST API reference +title: CAT aliases +parent: CAT API +redirect_from: +- /opensearch/rest-api/cat/cat-aliases/ + nav_order: 1 has_children: false --- -# cat aliases -Introduced 1.0 +# CAT aliases +**Introduced 1.0** {: .label .label-purple } -The cat aliases operation lists the mapping of aliases to indices, plus routing and filtering information. +The CAT aliases operation lists the mapping of aliases to indexes, plus routing and filtering information. 
## Example ```json GET _cat/aliases?v ``` +{% include copy-curl.html %} To limit the information to a specific alias, add the alias name after your query: ```json GET _cat/aliases/?v ``` +{% include copy-curl.html %} If you want to get information for more than one alias, separate the alias names with commas: ```json GET _cat/aliases/alias1,alias2,alias3 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -41,14 +46,14 @@ GET _cat/aliases ## URL parameters -All cat aliases URL parameters are optional. +All CAT aliases URL parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameters: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -expand_wildcards | Enum | Expands wildcard expressions to concrete indices. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. +expand_wildcards | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. ## Response @@ -57,7 +62,7 @@ The following response shows that `alias1` refers to a `movies` index and has a ```json alias | index | filter | routing.index | routing.search | is_write_index alias1 | movies | * | - | - | - -.kibana | .kibana_1 | - | - | - | - +.opensearch-dashboards | .opensearch-dashboards_1 | - | - | - | - ``` To learn more about index aliases, see [Index aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias). diff --git a/_opensearch/rest-api/cat/cat-allocation.md b/_api-reference/cat/cat-allocation.md similarity index 59% rename from _opensearch/rest-api/cat/cat-allocation.md rename to _api-reference/cat/cat-allocation.md index 6c5c0aa7..9598c8f3 100644 --- a/_opensearch/rest-api/cat/cat-allocation.md +++ b/_api-reference/cat/cat-allocation.md @@ -1,35 +1,39 @@ --- layout: default -title: cat allocation -parent: CAT -grand_parent: REST API reference +title: CAT allocation +parent: CAT API +redirect_from: +- /opensearch/rest-api/cat/cat-allocation/ nav_order: 5 has_children: false --- -# cat allocation -Introduced 1.0 +# CAT allocation +**Introduced 1.0** {: .label .label-purple } -The cat allocation operation lists the allocation of disk space for indices and the number of shards on each node. +The CAT allocation operation lists the allocation of disk space for indexes and the number of shards on each node. ## Example ```json GET _cat/allocation?v ``` +{% include copy-curl.html %} To limit the information to a specific node, add the node name after your query: ```json GET _cat/allocation/ ``` +{% include copy-curl.html %} If you want to get information for more than one node, separate the node names with commas: ```json -GET _cat/aliases/node_name_1,node_name_2,node_name_3 +GET _cat/allocation/node_name_1,node_name_2,node_name_3 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -40,21 +44,19 @@ GET _cat/allocation/ ## URL parameters -All cat allocation URL parameters are optional. +All CAT allocation URL parameters are optional. 
-In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameters: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. - - +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. ## Response -The following response shows that 8 shards are allocated to each the two nodes available: +The following response shows that eight shards are allocated to each of the two nodes available: ```json shards | disk.indices | disk.used | disk.avail | disk.total | disk.percent host | ip | node diff --git a/_api-reference/cat/cat-cluster_manager.md b/_api-reference/cat/cat-cluster_manager.md new file mode 100644 index 00000000..abf204ce --- /dev/null +++ b/_api-reference/cat/cat-cluster_manager.md @@ -0,0 +1,45 @@ +--- +layout: default +title: CAT cluster manager +parent: CAT API +redirect_from: + - /opensearch/rest-api/cat/cat-master/ +nav_order: 30 +has_children: false +--- + +# CAT cluster_manager +**Introduced 1.0** +{: .label .label-purple } + +The CAT cluster manager operation lists information that helps identify the elected cluster manager node. + +## Example + +``` +GET _cat/cluster_manager?v +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +GET _cat/cluster_manager +``` + +## URL parameters + +All CAT cluster manager URL parameters are optional. + +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: + +Parameter | Type | Description +:--- | :--- | :--- +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. + +## Response + +```json +id | host | ip | node +ZaIkkUd4TEiAihqJGkp5CA | 172.18.0.3 | 172.18.0.3 | opensearch-node2 +``` diff --git a/_opensearch/rest-api/cat/cat-count.md b/_api-reference/cat/cat-count.md similarity index 57% rename from _opensearch/rest-api/cat/cat-count.md rename to _api-reference/cat/cat-count.md index 69ddf279..34baa04d 100644 --- a/_opensearch/rest-api/cat/cat-count.md +++ b/_api-reference/cat/cat-count.md @@ -1,35 +1,40 @@ --- layout: default -title: cat count -parent: CAT -grand_parent: REST API reference +title: CAT count +parent: CAT API nav_order: 10 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-count/ + --- -# cat count -Introduced 1.0 +# CAT count +**Introduced 1.0** {: .label .label-purple } -The cat count operation lists the number of documents in your cluster. +The CAT count operation lists the number of documents in your cluster. 
## Example ```json GET _cat/count?v ``` +{% include copy-curl.html %} To see the number of documents in a specific index or alias, add the index or alias name after your query: ```json GET _cat/count/?v ``` +{% include copy-curl.html %} If you want to get information for more than one index or alias, separate the index or alias names with commas: ```json -GET _cat/aliases/index_or_alias_1,index_or_alias_2,index_or_alias_3 +GET _cat/count/index_or_alias_1,index_or_alias_2,index_or_alias_3 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -40,7 +45,7 @@ GET _cat/count/?v ## URL parameters -All cat count URL parameters are optional. You can specify any of the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters). +All CAT count URL parameters are optional. You can specify any of the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index). ## Response diff --git a/_opensearch/rest-api/cat/cat-field-data.md b/_api-reference/cat/cat-field-data.md similarity index 70% rename from _opensearch/rest-api/cat/cat-field-data.md rename to _api-reference/cat/cat-field-data.md index d86d17a1..6481e5ce 100644 --- a/_opensearch/rest-api/cat/cat-field-data.md +++ b/_api-reference/cat/cat-field-data.md @@ -1,35 +1,39 @@ --- layout: default -title: cat field data -parent: CAT -grand_parent: REST API reference +title: CAT field data +parent: CAT API nav_order: 15 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-field-data/ --- -# cat fielddata -Introduced 1.0 +# CAT fielddata +**Introduced 1.0** {: .label .label-purple } -The cat fielddata operation lists the memory size used by each field per node. +The CAT fielddata operation lists the memory size used by each field per node. ## Example ```json GET _cat/fielddata?v ``` +{% include copy-curl.html %} To limit the information to a specific field, add the field name after your query: ```json GET _cat/fielddata/?v ``` +{% include copy-curl.html %} If you want to get information for more than one field, separate the field names with commas: ```json -GET _cat/aliases/field_name_1,field_name_2,field_name_3 +GET _cat/fielddata/field_name_1,field_name_2,field_name_3 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -40,9 +44,9 @@ GET _cat/fielddata/?v ## URL parameters -All cat fielddata URL parameters are optional. +All CAT fielddata URL parameters are optional. 
-In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameter: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameter: Parameter | Type | Description :--- | :--- | :--- diff --git a/_opensearch/rest-api/cat/cat-health.md b/_api-reference/cat/cat-health.md similarity index 76% rename from _opensearch/rest-api/cat/cat-health.md rename to _api-reference/cat/cat-health.md index 476681f8..6077c77e 100644 --- a/_opensearch/rest-api/cat/cat-health.md +++ b/_api-reference/cat/cat-health.md @@ -1,33 +1,37 @@ --- layout: default -title: cat health -parent: CAT -grand_parent: REST API reference +title: CAT health +parent: CAT API + nav_order: 20 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-health/ --- -# cat health -Introduced 1.0 +# CAT health +**Introduced 1.0** {: .label .label-purple } -The cat health operation lists the status of the cluster, how long the cluster has been up, the number of nodes, and other useful information that helps you analyze the health of your cluster. +The CAT health operation lists the status of the cluster, how long the cluster has been up, the number of nodes, and other useful information that helps you analyze the health of your cluster. ## Example ```json GET _cat/health?v ``` +{% include copy-curl.html %} ## Path and HTTP methods ``` GET _cat/health?v ``` +{% include copy-curl.html %} ## URL parameters -All cat health URL parameters are optional. +All CAT health URL parameters are optional. Parameter | Type | Description :--- | :--- | :--- diff --git a/_opensearch/rest-api/cat/cat-indices.md b/_api-reference/cat/cat-indices.md similarity index 64% rename from _opensearch/rest-api/cat/cat-indices.md rename to _api-reference/cat/cat-indices.md index 0bd4a2e9..3a21e900 100644 --- a/_opensearch/rest-api/cat/cat-indices.md +++ b/_api-reference/cat/cat-indices.md @@ -1,35 +1,39 @@ --- layout: default -title: cat indices -parent: CAT -grand_parent: REST API reference +title: CAT indices operation +parent: CAT API nav_order: 25 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-indices/ --- -# cat indices -Introduced 1.0 +# CAT indices +**Introduced 1.0** {: .label .label-purple } -The cat indices operation lists information related to indices⁠—how much disk space they are using, how many shards they have, their health status, and so on. +The CAT indices operation lists information related to indexes, that is, how much disk space they are using, how many shards they have, their health status, and so on. ## Example ``` GET _cat/indices?v ``` +{% include copy-curl.html %} To limit the information to a specific index, add the index name after your query. ``` GET _cat/indices/?v ``` +{% include copy-curl.html %} -If you want to get information for more than one index, separate the indices with commas: +If you want to get information for more than one index, separate the indexes with commas: ```json -GET _cat/aliases/index1,index2,index3 +GET _cat/indices/index1,index2,index3 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -40,19 +44,19 @@ GET _cat/indices ## URL parameters -All cat indices URL parameters are optional. +All URL parameters are optional. 
-In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameters: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index/), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -health | String | Limit indices based on their health status. Supported values are `green`, `yellow`, and `red`. +health | String | Limit indexes based on their health status. Supported values are `green`, `yellow`, and `red`. include_unloaded_segments | Boolean | Whether to include information from segments not loaded into memory. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. pri | Boolean | Whether to return information only from the primary shards. Default is false. time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -expand_wildcards | Enum | Expands wildcard expressions to concrete indices. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. +expand_wildcards | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. ## Response diff --git a/_api-reference/cat/cat-nodeattrs.md b/_api-reference/cat/cat-nodeattrs.md new file mode 100644 index 00000000..95c1e50a --- /dev/null +++ b/_api-reference/cat/cat-nodeattrs.md @@ -0,0 +1,47 @@ +--- +layout: default +title: CAT nodeattrs +parent: CAT API +nav_order: 35 +has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-nodeattrs/ +--- + +# CAT nodeattrs +**Introduced 1.0** +{: .label .label-purple } + +The CAT nodeattrs operation lists the attributes of custom nodes. + +## Example + +``` +GET _cat/nodeattrs?v +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +GET _cat/nodeattrs +``` + +## URL parameters + +All CAT nodeattrs URL parameters are optional. + +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: + +Parameter | Type | Description +:--- | :--- | :--- +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. 
+ + +## Response + +```json +node | host | ip | attr | value +odfe-node2 | 172.18.0.3 | 172.18.0.3 | testattr | test +``` diff --git a/_opensearch/rest-api/cat/cat-nodes.md b/_api-reference/cat/cat-nodes.md similarity index 51% rename from _opensearch/rest-api/cat/cat-nodes.md rename to _api-reference/cat/cat-nodes.md index d2587bc6..149e5905 100644 --- a/_opensearch/rest-api/cat/cat-nodes.md +++ b/_api-reference/cat/cat-nodes.md @@ -1,25 +1,27 @@ --- layout: default -title: cat nodes -parent: CAT -grand_parent: REST API reference +title: CAT nodes +parent: CAT API nav_order: 40 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-nodes/ --- -# cat nodes -Introduced 1.0 +# CAT nodes +**Introduced 1.0** {: .label .label-purple } -The cat nodes operation lists node-level information, including node roles and load metrics. +The CAT nodes operation lists node-level information, including node roles and load metrics. -A few important node metrics are `pid`, `name`, `master`, `ip`, `port`, `version`, `build`, `jdk`, along with `disk`, `heap`, `ram`, and `file_desc`. +A few important node metrics are `pid`, `name`, `cluster_manager`, `ip`, `port`, `version`, `build`, `jdk`, along with `disk`, `heap`, `ram`, and `file_desc`. ## Example ``` GET _cat/nodes?v ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -29,16 +31,16 @@ GET _cat/nodes ## URL parameters -All cat nodes URL parameters are optional. +All CAT nodes URL parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameters: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). full_id | Boolean | If true, return the full node ID. If false, return the shortened node ID. Defaults to false. -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). include_unloaded_segments | Boolean | Whether to include information from segments not loaded into memory. Default is false. 
@@ -46,7 +48,6 @@ include_unloaded_segments | Boolean | Whether to include information from segmen ## Response ```json -ip | heap.percent | ram.percent | cpu load_1m | load_5m | load_15m | node.role | master | name -172.18.0.3 | 31 | 97 | 3 | 0.03 | 0.10 | 0.14 dimr | * | odfe-node2 -172.18.0.4 | 45 | 97 | 3 | 0.19 | 0.14 | 0.15 dimr | - | odfe-node1 +ip | heap.percent | ram.percent | cpu load_1m | load_5m | load_15m | node.role | node.roles | cluster_manager | name +10.11.1.225 | 31 | 32 | 0 | 0.00 | 0.00 | di | data,ingest,ml | - | data-e5b89ad7 ``` diff --git a/_opensearch/rest-api/cat/cat-pending-tasks.md b/_api-reference/cat/cat-pending-tasks.md similarity index 56% rename from _opensearch/rest-api/cat/cat-pending-tasks.md rename to _api-reference/cat/cat-pending-tasks.md index 37cf82ac..c8e1b744 100644 --- a/_opensearch/rest-api/cat/cat-pending-tasks.md +++ b/_api-reference/cat/cat-pending-tasks.md @@ -1,23 +1,26 @@ --- layout: default -title: cat pending tasks -parent: CAT -grand_parent: REST API reference +title: CAT pending tasks +parent: CAT API + nav_order: 45 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-pending-tasks/ --- -# cat pending tasks -Introduced 1.0 +# CAT pending tasks +**Introduced 1.0** {: .label .label-purple } -The cat pending tasks operation lists the progress of all pending tasks, including task priority and time in queue. +The CAT pending tasks operation lists the progress of all pending tasks, including task priority and time in queue. ## Example ``` GET _cat/pending_tasks?v ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -27,17 +30,16 @@ GET _cat/pending_tasks ## URL parameters -All cat nodes URL parameters are optional. +All CAT nodes URL parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameters: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). 
- ## Response ```json diff --git a/_opensearch/rest-api/cat/cat-plugins.md b/_api-reference/cat/cat-plugins.md similarity index 74% rename from _opensearch/rest-api/cat/cat-plugins.md rename to _api-reference/cat/cat-plugins.md index c4982914..34984622 100644 --- a/_opensearch/rest-api/cat/cat-plugins.md +++ b/_api-reference/cat/cat-plugins.md @@ -1,23 +1,26 @@ --- layout: default -title: cat plugins -parent: CAT -grand_parent: REST API reference +title: CAT plugins +parent: CAT API + nav_order: 50 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-plugins/ --- -# cat plugins -Introduced 1.0 +# CAT plugins +**Introduced 1.0** {: .label .label-purple } -The cat plugins operation lists the names, components, and versions of the installed plugins. +The CAT plugins operation lists the names, components, and versions of the installed plugins. ## Example ``` GET _cat/plugins?v ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -27,15 +30,14 @@ GET _cat/plugins ## URL parameters -All cat plugins URL parameters are optional. +All CAT plugins URL parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameters: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. - +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. ## Response diff --git a/_opensearch/rest-api/cat/cat-recovery.md b/_api-reference/cat/cat-recovery.md similarity index 78% rename from _opensearch/rest-api/cat/cat-recovery.md rename to _api-reference/cat/cat-recovery.md index 548456c0..54abac6d 100644 --- a/_opensearch/rest-api/cat/cat-recovery.md +++ b/_api-reference/cat/cat-recovery.md @@ -1,35 +1,40 @@ --- layout: default -title: cat recovery -parent: CAT -grand_parent: REST API reference +title: CAT recovery +parent: CAT API + nav_order: 50 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-recovery/ --- -# cat recovery -Introduced 1.0 +# CAT recovery +**Introduced 1.0** {: .label .label-purple } -The cat recovery operation lists all completed and ongoing index and shard recoveries. +The CAT recovery operation lists all completed and ongoing index and shard recoveries. ## Example ``` GET _cat/recovery?v ``` +{% include copy-curl.html %} To see only the recoveries of a specific index, add the index name after your query. ``` GET _cat/recovery/?v ``` +{% include copy-curl.html %} -If you want to get information for more than one index, separate the indices with commas: +If you want to get information for more than one index, separate the indexes with commas: ```json -GET _cat/aliases/index1,index2,index3 +GET _cat/recovery/index1,index2,index3 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -39,9 +44,9 @@ GET _cat/recovery ## URL parameters -All cat recovery URL parameters are optional. +All CAT recovery URL parameters are optional. 
-In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameters: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- diff --git a/_api-reference/cat/cat-repositories.md b/_api-reference/cat/cat-repositories.md new file mode 100644 index 00000000..94f39b9d --- /dev/null +++ b/_api-reference/cat/cat-repositories.md @@ -0,0 +1,49 @@ +--- +layout: default +title: CAT repositories +parent: CAT API + +nav_order: 52 +has_children: false +redirect_from: + - /opensearch/rest-api/cat/cat-repositories/ +--- + +# CAT repositories +**Introduced 1.0** +{: .label .label-purple } + +The CAT repositories operation lists all snapshot repositories for a cluster. + +## Example + +``` +GET _cat/repositories?v +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +GET _cat/repositories +``` + +## URL parameters + +All CAT repositories URL parameters are optional. + +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: + +Parameter | Type | Description +:--- | :--- | :--- +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. + + +## Response + +```json +id type +repo1 fs +repo2 s3 +``` diff --git a/_api-reference/cat/cat-segment-replication.md b/_api-reference/cat/cat-segment-replication.md new file mode 100644 index 00000000..e22012ea --- /dev/null +++ b/_api-reference/cat/cat-segment-replication.md @@ -0,0 +1,167 @@ +--- +layout: default +title: CAT segment replication +parent: CAT API +nav_order: 53 +has_children: false +--- + +# CAT segment replication +**Introduced 2.7** +{: .label .label-purple } + +The CAT segment replication operation returns information about active and last completed [segment replication]({{site.url}}{{site.baseurl}}/opensearch/segment-replication/index) events on each replica shard, including related shard-level metrics. These metrics provide information about how far behind the primary shard the replicas are lagging. + +Call the CAT Segment Replication API only on indexes with segment replication enabled. +{: .note} + +## Path and HTTP methods + +```json +GET /_cat/segment_replication +GET /_cat/segment_replication/ +``` + +## Path parameters + +The following table lists the available optional path parameter. + +Parameter | Type | Description +:--- | :--- | :--- +`index` | String | The name of the index, or a comma-separated list or wildcard expression of index names used to filter results. If this parameter is not provided, the response contains information about all indexes in the cluster. + +## Query parameters + +The CAT segment replication API operation supports the following optional query parameters. + +Parameter | Data type | Description +:--- |:-----------| :--- +`active_only` | Boolean | If `true`, the response only includes active segment replications. Defaults to `false`. +[`detailed`](#additional-detailed-response-metrics) | String | If `true`, the response includes additional metrics for each stage of a segment replication event. Defaults to `false`. +`shards` | String | A comma-separated list of shards to display. 
+`bytes` | Byte units | [Units]({{site.url}}{{site.baseurl}}/opensearch/units/) used to display byte size values. +`format` | String | A short version of the HTTP accept header. Valid values include `JSON` and `YAML`. +`h` | String | A comma-separated list of column names to display. +`help` | Boolean | If `true`, the response includes help information. Defaults to `false`. +`time` | Time units | [Units]({{site.url}}{{site.baseurl}}/opensearch/units/) used to display time values. +`v` | Boolean | If `true`, the response includes column headings. Defaults to `false`. +`s` | String | Specifies to sort the results. For example, `s=shardId:desc` sorts by shardId in descending order. + +## Example + +The following examples illustrate various segment replication responses. + +#### Example 1: No active segment replication events + +The following query requests segment replication metrics with column headings for all indexes: + +```json +GET /_cat/segment_replication?v=true +``` +{% include copy-curl.html %} + +The response contains the metrics for the preceding request: + +```bash +shardId target_node target_host checkpoints_behind bytes_behind current_lag last_completed_lag rejected_requests +[index-1][0] runTask-1 127.0.0.1 0 0b 0s 7ms 0 +``` + +#### Example 2: Shard ID specified + +The following query requests segment replication metrics with column headings for shards with the ID `0` from indexes `index1` and `index2`: + +```json +GET /_cat/segment_replication/index1,index2?v=true&shards=0 +``` +{% include copy-curl.html %} + +The response contains the metrics for the preceding request. The column headings correspond to the metric names: + +```bash +shardId target_node target_host checkpoints_behind bytes_behind current_lag last_completed_lag rejected_requests +[index-1][0] runTask-1 127.0.0.1 0 0b 0s 3ms 0 +[index-2][0] runTask-1 127.0.0.1 0 0b 0s 5ms 0 +``` + +#### Example 3: Detailed response + +The following query requests detailed segment replication metrics with column headings for all indexes: + +```json +GET /_cat/segment_replication?v=true&detailed=true +``` +{% include copy-curl.html %} + +The response contains additional metrics about the files and stages of a segment replication event: + +```bash +shardId target_node target_host checkpoints_behind bytes_behind current_lag last_completed_lag rejected_requests stage time files_fetched files_percent bytes_fetched bytes_percent start_time stop_time files files_total bytes bytes_total replicating_stage_time_taken get_checkpoint_info_stage_time_taken file_diff_stage_time_taken get_files_stage_time_taken finalize_replication_stage_time_taken +[index-1][0] runTask-1 127.0.0.1 0 0b 0s 3ms 0 done 10ms 6 100.0% 4753 100.0% 2023-03-16T13:46:16.802Z 2023-03-16T13:46:16.812Z 6 6 4.6kb 4.6kb 0s 2ms 0s 3ms 3ms +[index-2][0] runTask-1 127.0.0.1 0 0b 0s 5ms 0 done 7ms 3 100.0% 3664 100.0% 2023-03-16T13:53:33.466Z 2023-03-16T13:53:33.474Z 3 3 3.5kb 3.5kb 0s 1ms 0s 2ms 2ms +``` + +#### Example 4: Sorting the results + +The following query requests segment replication metrics with column headings for all indexes, sorted by shard ID in descending order: + +```json +GET /_cat/segment_replication?v&s=shardId:desc +``` +{% include copy-curl.html %} + +The response contains the sorted results: + +```bash +shardId target_node target_host checkpoints_behind bytes_behind current_lag last_completed_lag rejected_requests +[test6][1] runTask-2 127.0.0.1 0 0b 0s 5ms 0 +[test6][0] runTask-2 127.0.0.1 0 0b 0s 4ms 0 +``` + +#### Example 5: Using a metric alias + +In 
a request, you can either use a metric's full name or one of its aliases. The following query is the same as the preceding query, but it uses the alias `s` instead of `shardId` for sorting:
+
+```json
+GET /_cat/segment_replication?v&s=s:desc
+```
+{% include copy-curl.html %}
+
+## Response metrics
+
+The following table lists the response metrics that are returned for all requests. When referring to a metric in a query parameter, you can provide either the metric's full name or any of its aliases, as shown in the previous [example](#example-5-using-a-metric-alias).
+
+Metric | Alias | Description
+:--- | :--- | :---
+`shardId` | `s` | The ID of a specific shard.
+`target_host` | `thost` | The target host IP address.
+`target_node` | `tnode` | The target node name.
+`checkpoints_behind` | `cpb` | The number of checkpoints by which the replica shard is behind the primary shard.
+`bytes_behind` | `bb` | The number of bytes by which the replica shard is behind the primary shard.
+`current_lag` | `clag` | The time elapsed while waiting for a replica shard to catch up to the primary shard.
+`last_completed_lag` | `lcl` | The time taken for a replica shard to catch up to the latest primary shard refresh.
+`rejected_requests` | `rr` | The number of rejected requests for the replication group.
+
+### Additional detailed response metrics
+
+The following table lists the additional response fields returned if `detailed` is set to `true`.
+
+Metric | Alias | Description
+:--- |:--- |:---
+`stage` | `st` | The current stage of a segment replication event.
+`time` | `t`, `ti` | The amount of time a segment replication event took to complete, in milliseconds.
+`files_fetched` | `ff` | The number of files fetched so far for a segment replication event.
+`files_percent` | `fp` | The percentage of files fetched so far for a segment replication event.
+`bytes_fetched` | `bf` | The number of bytes fetched so far for a segment replication event.
+`bytes_percent` | `bp` | The percentage of bytes fetched so far for a segment replication event.
+`start_time` | `start` | The segment replication start time.
+`stop_time` | `stop` | The segment replication stop time.
+`files` | `f` | The number of files that need to be fetched for a segment replication event.
+`files_total` | `tf` | The total number of files that are part of this recovery, including both reused and recovered files.
+`bytes` | `b` | The number of bytes that need to be fetched for a segment replication event.
+`bytes_total` | `tb` | The total number of bytes in the shard.
+`replicating_stage_time_taken` | `rstt` | The amount of time the `replicating` stage of a segment replication event took to complete.
+`get_checkpoint_info_stage_time_taken` | `gcistt` | The amount of time the `get checkpoint info` stage of a segment replication event took to complete.
+`file_diff_stage_time_taken` | `fdstt` | The amount of time the `file diff` stage of a segment replication event took to complete.
+`get_files_stage_time_taken` | `gfstt` | The amount of time the `get files` stage of a segment replication event took to complete.
+`finalize_replication_stage_time_taken` | `frstt` | The amount of time the `finalize replication` stage of a segment replication event took to complete.
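The metric names and aliases in the preceding tables can also be combined with the `h` query parameter to limit the output to selected columns; the following sketch uses an arbitrary column choice for illustration:

```json
GET /_cat/segment_replication?v=true&h=shardId,target_node,checkpoints_behind,current_lag
```
{% include copy-curl.html %}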
diff --git a/_opensearch/rest-api/cat/cat-segments.md b/_api-reference/cat/cat-segments.md similarity index 70% rename from _opensearch/rest-api/cat/cat-segments.md rename to _api-reference/cat/cat-segments.md index a5fce2e5..b8604866 100644 --- a/_opensearch/rest-api/cat/cat-segments.md +++ b/_api-reference/cat/cat-segments.md @@ -1,14 +1,16 @@ --- layout: default -title: cat segments -parent: CAT -grand_parent: REST API reference +title: CAT segments +parent: CAT API + nav_order: 55 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-segments/ --- -# cat segments -Introduced 1.0 +# CAT segments +**Introduced 1.0** {: .label .label-purple } The cat segments operation lists Lucene segment-level information for each index. @@ -18,18 +20,21 @@ The cat segments operation lists Lucene segment-level information for each index ``` GET _cat/segments?v ``` +{% include copy-curl.html %} To see only the information about segments of a specific index, add the index name after your query. ``` GET _cat/segments/?v ``` +{% include copy-curl.html %} -If you want to get information for more than one index, separate the indices with commas: +If you want to get information for more than one index, separate the indexes with commas: ``` GET _cat/segments/index1,index2,index3 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -39,13 +44,14 @@ GET _cat/segments ## URL parameters -All cat segments URL parameters are optional. +All CAT segments URL parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameter: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/).. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. ## Response diff --git a/_opensearch/rest-api/cat/cat-shards.md b/_api-reference/cat/cat-shards.md similarity index 69% rename from _opensearch/rest-api/cat/cat-shards.md rename to _api-reference/cat/cat-shards.md index 00d4b554..e74667b5 100644 --- a/_opensearch/rest-api/cat/cat-shards.md +++ b/_api-reference/cat/cat-shards.md @@ -1,35 +1,40 @@ --- layout: default -title: cat shards -parent: CAT -grand_parent: REST API reference +title: CAT shards +parent: CAT API + nav_order: 60 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-shards/ --- -# cat shards -Introduced 1.0 +# CAT shards +**Introduced 1.0** {: .label .label-purple } -The cat shards operation lists the state of all primary and replica shards and how they are distributed. +The CAT shards operation lists the state of all primary and replica shards and how they are distributed. ## Example ``` GET _cat/shards?v ``` +{% include copy-curl.html %} To see only the information about shards of a specific index, add the index name after your query. 
``` GET _cat/shards/?v ``` +{% include copy-curl.html %} -If you want to get information for more than one index, separate the indices with commas: +If you want to get information for more than one index, separate the indexes with commas: ``` GET _cat/shards/index1,index2,index3 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -41,13 +46,13 @@ GET _cat/shards All cat shards URL parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameter: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). diff --git a/_opensearch/rest-api/cat/cat-snapshots.md b/_api-reference/cat/cat-snapshots.md similarity index 60% rename from _opensearch/rest-api/cat/cat-snapshots.md rename to _api-reference/cat/cat-snapshots.md index 71aa30cf..82cb5c1b 100644 --- a/_opensearch/rest-api/cat/cat-snapshots.md +++ b/_api-reference/cat/cat-snapshots.md @@ -1,23 +1,26 @@ --- layout: default -title: cat snapshots -parent: CAT -grand_parent: REST API reference +title: CAT snapshots +parent: CAT API + nav_order: 65 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-snapshots/ --- -# cat snapshots -Introduced 1.0 +# CAT snapshots +**Introduced 1.0** {: .label .label-purple } -The cat snapshots operation lists all snapshots for a repository. +The CAT snapshots operation lists all snapshots for a repository. ## Example ``` GET _cat/snapshots?v ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -27,13 +30,13 @@ GET _cat/snapshots ## URL parameters -All cat snapshots URL parameters are optional. +All CAT snapshots URL parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameter: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameter: Parameter | Type | Description :--- | :--- | :--- -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). 
diff --git a/_opensearch/rest-api/cat/cat-tasks.md b/_api-reference/cat/cat-tasks.md similarity index 79% rename from _opensearch/rest-api/cat/cat-tasks.md rename to _api-reference/cat/cat-tasks.md index 2d30836b..4d2a06cc 100644 --- a/_opensearch/rest-api/cat/cat-tasks.md +++ b/_api-reference/cat/cat-tasks.md @@ -1,23 +1,26 @@ --- layout: default -title: cat tasks -parent: CAT -grand_parent: REST API reference +title: CAT tasks +parent: CAT API + nav_order: 70 has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-tasks/ --- -# cat tasks -Introduced 1.0 +# CAT tasks +**Introduced 1.0** {: .label .label-purple } -The cat tasks operation lists the progress of all tasks currently running on your cluster. +The CAT tasks operation lists the progress of all tasks currently running on your cluster. ## Example ``` GET _cat/tasks?v ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -27,9 +30,9 @@ GET _cat/tasks ## URL parameters -All cat tasks URL parameters are optional. +All CAT tasks URL parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/index#common-url-parameters), you can specify the following parameter: +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- diff --git a/_api-reference/cat/cat-templates.md b/_api-reference/cat/cat-templates.md new file mode 100644 index 00000000..d2aed7b0 --- /dev/null +++ b/_api-reference/cat/cat-templates.md @@ -0,0 +1,58 @@ +--- +layout: default +title: CAT templates +parent: CAT API + +nav_order: 70 +has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-templates/ +--- + +# CAT templates +**Introduced 1.0** +{: .label .label-purple } + +The CAT templates operation lists the names, patterns, order numbers, and version numbers of index templates. + +## Example + +``` +GET _cat/templates?v +``` +{% include copy-curl.html %} + +If you want to get information for a specific template or pattern: + +``` +GET _cat/templates/ +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +GET _cat/templates +``` +{% include copy-curl.html %} + +## URL parameters + +All CAT templates URL parameters are optional. + +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: + +Parameter | Type | Description +:--- | :--- | :--- +local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. + + +## Response + +``` +name | index_patterns order version composed_of +tenant_template | [opensearch-dashboards*] | 0 | +``` + +To learn more about index templates, see [Index templates]({{site.url}}{{site.baseurl}}/opensearch/index-templates). 
diff --git a/_api-reference/cat/cat-thread-pool.md b/_api-reference/cat/cat-thread-pool.md new file mode 100644 index 00000000..5d3e341b --- /dev/null +++ b/_api-reference/cat/cat-thread-pool.md @@ -0,0 +1,63 @@ +--- +layout: default +title: CAT thread pool +parent: CAT API +nav_order: 75 +has_children: false +redirect_from: +- /opensearch/rest-api/cat/cat-thread-pool/ +--- + +# CAT thread pool +**Introduced 1.0** +{: .label .label-purple } + +The CAT thread pool operation lists the active, queued, and rejected threads of different thread pools on each node. + +## Example + +``` +GET _cat/thread_pool?v +``` +{% include copy-curl.html %} + +If you want to get information for more than one thread pool, separate the thread pool names with commas: + +``` +GET _cat/thread_pool/thread_pool_name_1,thread_pool_name_2,thread_pool_name_3 +``` +{% include copy-curl.html %} + +If you want to limit the information to a specific thread pool, add the thread pool name after your query: + +``` +GET _cat/thread_pool/?v +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +GET _cat/thread_pool +``` + +## URL parameters + +All CAT thread pool URL parameters are optional. + +In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: + +Parameter | Type | Description +:--- | :--- | :--- +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. + + +## Response + +```json +node_name name active queue rejected +odfe-node2 ad-batch-task-threadpool 0 0 0 +odfe-node2 ad-threadpool 0 0 0 +odfe-node2 analyze 0 0 0s +``` diff --git a/_api-reference/cat/index.md b/_api-reference/cat/index.md new file mode 100644 index 00000000..0ddaf1e0 --- /dev/null +++ b/_api-reference/cat/index.md @@ -0,0 +1,86 @@ +--- +layout: default +title: CAT API +nav_order: 10 +has_children: true +redirect_from: + - /opensearch/catapis/ + - /opensearch/rest-api/cat/index/ +--- + +# CAT API +**Introduced 1.0** +{: .label .label-purple } +You can get essential statistics about your cluster in an easy-to-understand, tabular format using the compact and aligned text (CAT) API. The CAT API is a human-readable interface that returns plain text instead of traditional JSON. + +Using the CAT API, you can answer questions like which node is the elected master, what state is the cluster in, how many documents are in each index, and so on. + +## Example + +To see the available operations in the CAT API, use the following command: + +``` +GET _cat +``` +{% include copy-curl.html %} + +## Optional query parameters + +You can use the following query parameters with any CAT API to filter your results. + +Parameter | Description +:--- | :--- | +`v` | Provides verbose output by adding headers to the columns. It also adds some formatting to help align each of the columns together. All examples in this section include the `v` parameter. +`help` | Lists the default and other available headers for a given operation. +`h` | Limits the output to specific headers. +`format` | Returns the result in JSON, YAML, or CBOR formats. +`sort` | Sorts the output by the specified columns. + +### Query parameter usage examples + +You can specify a query parameter to any CAT operation to obtain more specific results. 
+
+### Get verbose output
+
+To query aliases and get verbose output that includes all column headings in the response, use the `v` query parameter.
+
+```json
+GET _cat/aliases?v
+```
+{% include copy-curl.html %}
+
+The response provides more details, such as the names of each column in the response.
+
+```
+alias index filter routing.index routing.search is_write_index
+.kibana .kibana_1 - - - -
+sample-alias1 sample-index-1 - - - -
+```
+Without the verbose parameter, `v`, the response contains the same information but omits the column headings:
+
+```
+
+.kibana .kibana_1 - - - -
+sample-alias1 sample-index-1 - - - -
+```
+
+### Get all available headers
+
+To see all the available headers, use the `help` parameter:
+
+```
+GET _cat/?help
+```
+
+### Get a subset of headers
+
+To limit the output to a subset of headers, use the `h` parameter:
+
+```
+GET _cat/?h=,&v
+```
+
+Typically, for any operation, you can find out which headers are available by using the `help` parameter and then use the `h` parameter to limit the output to only the headers that you care about.
+
+If you use the Security plugin, make sure you have the appropriate permissions.
+{: .note } diff --git a/_opensearch/rest-api/cluster-allocation.md b/_api-reference/cluster-api/cluster-allocation.md similarity index 96% rename from _opensearch/rest-api/cluster-allocation.md rename to _api-reference/cluster-api/cluster-allocation.md index 07caee47..da6e3aab 100644 --- a/_opensearch/rest-api/cluster-allocation.md +++ b/_api-reference/cluster-api/cluster-allocation.md @@ -1,12 +1,15 @@ --- layout: default title: Cluster allocation explain -parent: REST API reference nav_order: 10 +parent: Cluster APIs +has_children: false +redirect_from: + - /opensearch/rest-api/cluster-allocation/ --- # Cluster allocation explain -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } The most basic cluster allocation explain request finds an unassigned shard and explains why it can't be allocated to a node. @@ -24,7 +27,7 @@ GET _cluster/allocation/explain?include_yes_decisions=true "primary": true } ``` - +{% include copy-curl.html %} ## Path and HTTP methods diff --git a/_api-reference/cluster-api/cluster-awareness.md b/_api-reference/cluster-api/cluster-awareness.md new file mode 100644 index 00000000..18259b9a --- /dev/null +++ b/_api-reference/cluster-api/cluster-awareness.md @@ -0,0 +1,133 @@
+---
+layout: default
+title: Cluster routing and awareness
+nav_order: 20
+parent: Cluster APIs
+has_children: false
+redirect_from:
+ - /api-reference/cluster-awareness/
+ - /opensearch/rest-api/cluster-awareness/
+---
+
+# Cluster routing and awareness
+**Introduced 1.0**
+{: .label .label-purple }
+
+You can use weights per awareness attribute to control the distribution of search or HTTP traffic across zones. This is commonly used for zonal deployments, heterogeneous instances, and routing traffic away from zones during zonal failure.
+
+## Path and HTTP methods
+
+```
+PUT /_cluster/routing/awareness//weights
+GET /_cluster/routing/awareness//weights?local
+GET /_cluster/routing/awareness//weights
+```
+
+## Path parameters
+
+Parameter | Type | Description
+:--- | :--- | :---
+attribute | String | The name of the awareness attribute, usually `zone`. The attribute name must match the values listed in the request body when assigning weights to zones.
+
+## Request body parameters
+
+Parameter | Type | Description
+:--- | :--- | :---
+weights | JSON object | Assigns weights to attributes within the request body of the PUT request. Weights can be set in any ratio, for example, 2:3:5. In a 2:3:5 ratio with 3 zones, for every 100 requests sent to the cluster, each zone would receive either 20, 30, or 50 search requests in a random order. When assigned a weight of `0`, the zone does not receive any search traffic.
+_version | String | Implements optimistic concurrency control (OCC) through versioning. The parameter uses simple versioning, such as `1`, and increments upward based on each subsequent modification. This allows any servers from which a request originates to validate whether or not a zone has been modified.
+
+
+In the following example request body, `zone_1` and `zone_2` receive 50 requests each, whereas `zone_3` is prevented from receiving requests:
+
+```
+{
+   "weights":
+   {
+      "zone_1": "5",
+      "zone_2": "5",
+      "zone_3": "0"
+   },
+   "_version" : 1
+}
+```
+
+## Example: Weighted round robin search
+
+The following example request creates a round robin distribution of search traffic by assigning equal weights to `zone_1` and `zone_2` and a weight of `0` to `zone_3`:
+
+#### Request
+
+```json
+PUT /_cluster/routing/awareness/zone/weights
+{
+   "weights":
+   {
+      "zone_1": "1",
+      "zone_2": "1",
+      "zone_3": "0"
+   },
+   "_version" : 1
+}
+```
+{% include copy-curl.html %}
+
+#### Response
+
+```
+{
+    "acknowledged": true
+}
+```
+
+
+## Example: Getting weights for all zones
+
+The following example request gets weights for all zones.
+
+#### Request
+
+```json
+GET /_cluster/routing/awareness/zone/weights
+```
+{% include copy-curl.html %}
+
+#### Response
+
+OpenSearch responds with the weight of each zone:
+
+```json
+{
+      "weights":
+      {
+
+        "zone_1": "1.0",
+        "zone_2": "1.0",
+        "zone_3": "0.0"
+      },
+      "_version":1
+}
+```
+
+## Example: Deleting weights
+
+You can remove your weight ratio for each zone using the `DELETE` method.
+
+#### Request
+
+```json
+DELETE /_cluster/routing/awareness/zone/weights
+```
+{% include copy-curl.html %}
+
+#### Response
+
+```json
+{
+    "_version":1
+}
+```
+
+## Next steps
+
+- For more information about zone commissioning, see [Cluster decommission]({{site.url}}{{site.baseurl}}/api-reference/cluster-decommission/).
+- For more information about allocation awareness, see [Cluster formation]({{site.url}}{{site.baseurl}}/opensearch/cluster/#advanced-step-6-configure-shard-allocation-awareness-or-forced-awareness). diff --git a/_api-reference/cluster-api/cluster-decommission.md b/_api-reference/cluster-api/cluster-decommission.md new file mode 100644 index 00000000..867f58ed --- /dev/null +++ b/_api-reference/cluster-api/cluster-decommission.md @@ -0,0 +1,89 @@
+---
+layout: default
+title: Cluster decommission
+nav_order: 30
+parent: Cluster APIs
+has_children: false
+redirect_from:
+ - /api-reference/cluster-decommission/
+ - /opensearch/rest-api/cluster-decommission/
+---
+
+# Cluster decommission
+**Introduced 1.0**
+{: .label .label-purple }
+
+The cluster decommission operation adds support for decommissioning based on awareness. It greatly benefits multi-zone deployments, where awareness attributes, such as `zones`, can aid in applying new upgrades to a cluster in a controlled fashion. This is especially useful during outages, when you can decommission the unhealthy zone to prevent replication requests from stalling and prevent your request backlog from becoming too large.
+
+For more information about allocation awareness, see [Shard allocation awareness]({{site.url}}{{site.baseurl}}/opensearch/cluster/#shard-allocation-awareness).
+
+
+## Path and HTTP methods
+
+```
+PUT /_cluster/decommission/awareness/{awareness_attribute_name}/{awareness_attribute_value}
+GET /_cluster/decommission/awareness/{awareness_attribute_name}/_status
+DELETE /_cluster/decommission/awareness
+```
+
+## URL parameters
+
+Parameter | Type | Description
+:--- | :--- | :---
+awareness_attribute_name | String | The name of the awareness attribute, usually `zone`.
+awareness_attribute_value | String | The value of the awareness attribute. For example, if you have shards allocated in two different zones, you can give each zone a value of `zone-a` or `zone-b`. The cluster decommission operation decommissions the zone listed in the method.
+
+
+## Example: Decommissioning and recommissioning a zone
+
+You can use the following example requests to decommission and recommission a zone:
+
+#### Request
+
+The following example request decommissions `zone-a`:
+
+```json
+PUT /_cluster/decommission/awareness/zone/zone-a
+```
+{% include copy-curl.html %}
+
+If you want to recommission a decommissioned zone, you can use the `DELETE` method:
+
+```json
+DELETE /_cluster/decommission/awareness
+```
+{% include copy-curl.html %}
+
+#### Response
+
+
+```json
+{
+      "acknowledged": true
+}
+```
+
+## Example: Getting zone decommission status
+
+The following example request returns the decommission status of all zones.
+
+#### Request
+
+```json
+GET /_cluster/decommission/awareness/zone/_status
+```
+{% include copy-curl.html %}
+
+#### Response
+
+```json
+{
+     "zone-1": "INIT | DRAINING | IN_PROGRESS | SUCCESSFUL | FAILED"
+}
+```
+
+
+## Next steps
+
+- For more information about zone awareness and weight, see [Cluster awareness]({{site.url}}{{site.baseurl}}/api-reference/cluster-awareness/).
+- For more information about allocation awareness, see [Cluster formation]({{site.url}}{{site.baseurl}}/opensearch/cluster/#advanced-step-6-configure-shard-allocation-awareness-or-forced-awareness). diff --git a/_api-reference/cluster-api/cluster-health.md b/_api-reference/cluster-api/cluster-health.md new file mode 100644 index 00000000..e9e2bb0e --- /dev/null +++ b/_api-reference/cluster-api/cluster-health.md @@ -0,0 +1,217 @@
+---
+layout: default
+title: Cluster health
+nav_order: 40
+parent: Cluster APIs
+has_children: false
+redirect_from:
+ - /api-reference/cluster-health/
+ - /opensearch/rest-api/cluster-health/
+---
+
+# Cluster health
+**Introduced 1.0**
+{: .label .label-purple }
+
+The most basic cluster health request returns a simple status of the health of your cluster. OpenSearch expresses cluster health in three colors: green, yellow, and red. A green status means all primary shards and their replicas are allocated to nodes. A yellow status means all primary shards are allocated to nodes, but some replicas aren't. A red status means at least one primary shard is not allocated to any node.
+
+To get the status of a specific index, provide the index name.
+
+## Example
+
+This request waits 50 seconds for the cluster to reach the yellow status or better:
+
+```
+GET _cluster/health?wait_for_status=yellow&timeout=50s
+```
+{% include copy-curl.html %}
+
+If the cluster health becomes yellow or green before 50 seconds elapse, it returns a response immediately. Otherwise, it returns a response as soon as the timeout expires.
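To check the health of a single index rather than the entire cluster, append the index name to the path; the index name `movies` below is only a placeholder:

```json
GET _cluster/health/movies
```
{% include copy-curl.html %}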
+
+## Path and HTTP methods
+
+```
+GET _cluster/health
+GET _cluster/health/
+```
+
+## Query parameters
+
+The following table lists the available query parameters. All query parameters are optional.
+
+Parameter | Type | Description
+:--- | :--- | :---
+expand_wildcards | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`.
+level | Enum | The level of detail for returned health information. Supported values are `cluster`, `indices`, `shards`, and `awareness_attributes`. Default is `cluster`.
+awareness_attribute | String | The name of the awareness attribute, for which to return cluster health (for example, `zone`). Applicable only if `level` is set to `awareness_attributes`.
+local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is false.
+cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds.
+timeout | Time | The amount of time to wait for a response. If the timeout expires, the request fails. Default is 30 seconds.
+wait_for_active_shards | String | Wait until the specified number of shards is active before returning a response. `all` for all shards. Default is `0`.
+wait_for_nodes | String | Wait for N number of nodes. Use `12` for exact match, `>12` and `<12` for range.
+wait_for_events | Enum | Wait until all currently queued events with the given priority are processed. Supported values are `immediate`, `urgent`, `high`, `normal`, `low`, and `languid`.
+wait_for_no_relocating_shards | Boolean | Whether to wait until there are no relocating shards in the cluster. Default is false.
+wait_for_no_initializing_shards | Boolean | Whether to wait until there are no initializing shards in the cluster. Default is false.
+wait_for_status | Enum | Wait until the cluster health reaches the specified status or better. Supported values are `green`, `yellow`, and `red`.
+weights | JSON object | Assigns weights to attributes within the request body of the PUT request. Weights can be set in any ratio, for example, 2:3:5. In a 2:3:5 ratio with three zones, for every 100 requests sent to the cluster, each zone would receive either 20, 30, or 50 search requests in a random order. When assigned a weight of `0`, the zone does not receive any search traffic.
+
+#### Example request
+
+The following example request retrieves cluster health for all indexes in the cluster:
+
+```json
+GET _cluster/health
+```
+{% include copy-curl.html %}
+
+#### Example response
+
+The response contains cluster health information:
+
+```json
+{
+  "cluster_name" : "opensearch-cluster",
+  "status" : "green",
+  "timed_out" : false,
+  "number_of_nodes" : 2,
+  "number_of_data_nodes" : 2,
+  "discovered_master" : true,
+  "active_primary_shards" : 6,
+  "active_shards" : 12,
+  "relocating_shards" : 0,
+  "initializing_shards" : 0,
+  "unassigned_shards" : 0,
+  "delayed_unassigned_shards" : 0,
+  "number_of_pending_tasks" : 0,
+  "number_of_in_flight_fetch" : 0,
+  "task_max_waiting_in_queue_millis" : 0,
+  "active_shards_percent_as_number" : 100.0
+}
+```
+
+## Response fields
+
+The following table lists all response fields.
+
+|Field |Data type |Description |
+|:--- |:--- |:--- |
+|cluster_name | String | The name of the cluster. |
+|status | String | The cluster health status, which represents the state of shard allocation in the cluster.
May be `green`, `yellow`, or `red`. | +|number_of_nodes | Integer | The number of nodes in the cluster. | +|number_of_data_nodes | Integer | The number of data nodes in the cluster. | +|discovered_cluster_manager | Boolean | Specifies whether the cluster manager is discovered. | +|active_primary_shards | Integer | The number of active primary shards. | +|active_shards | Integer | The total number of active shards, including primary and replica shards. | +|relocating_shards | Integer | The number of relocating shards. | +|initializing_shards | Integer | The number of initializing shards. | +|unassigned_shards | Integer | The number of unassigned shards. | +|delayed_unassigned_shards | Integer | The number of delayed unassigned shards. | +|number_of_pending_tasks | Integer | The number of pending tasks in the cluster. | +|number_of_in_flight_fetch | Integer | The number of unfinished fetches. | +|task_max_waiting_in_queue_millis | Integer | The maximum wait time for all tasks waiting to be performed, in milliseconds. | +|active_shards_percent_as_number | Double | The percentage of active shards in the cluster. | +|awareness_attributes | Object | Contains cluster health information for each awareness attribute. | + +## Returning cluster health by awareness attribute + +To check cluster health by awareness attribute (for example, zone or rack), specify `awareness_attributes` in the `level` query parameter: + +```json +GET _cluster/health?level=awareness_attributes +``` +{% include copy-curl.html %} + +The response contains cluster health metrics partitioned by awareness attribute: + +```json +{ + "cluster_name": "runTask", + "status": "green", + "timed_out": false, + "number_of_nodes": 3, + "number_of_data_nodes": 3, + "discovered_master": true, + "discovered_cluster_manager": true, + "active_primary_shards": 0, + "active_shards": 0, + "relocating_shards": 0, + "initializing_shards": 0, + "unassigned_shards": 0, + "delayed_unassigned_shards": 0, + "number_of_pending_tasks": 0, + "number_of_in_flight_fetch": 0, + "task_max_waiting_in_queue_millis": 0, + "active_shards_percent_as_number": 100, + "awareness_attributes": { + "zone": { + "zone-3": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + }, + "zone-1": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + }, + "zone-2": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + } + }, + "rack": { + "rack-3": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + }, + "rack-1": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + }, + "rack-2": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + } + } + } +} +``` + +If you're interested in a particular awareness attribute, you can include the name of the awareness attribute as a query parameter: + +```json +GET _cluster/health?level=awareness_attributes&awareness_attribute=zone +``` +{% include copy-curl.html %} + +In response to the preceding request, OpenSearch returns cluster health information only for the `zone` awareness attribute. 
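+
+In this case, the `awareness_attributes` object contains only the `zone` entry. An abridged sketch of such a response, based on the full example above, looks similar to the following (cluster-level fields and additional zones are omitted here):
+
+```json
+{
+  "awareness_attributes": {
+    "zone": {
+      "zone-1": {
+        "active_shards": 0,
+        "initializing_shards": 0,
+        "relocating_shards": 0,
+        "unassigned_shards": 0,
+        "data_nodes": 1,
+        "weight": 1
+      }
+    }
+  }
+}
+```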
+
+The unassigned shard information will be accurate only if you [enable replica count enforcement]({{site.url}}{{site.baseurl}}/opensearch/cluster#replica-count-enforcement) and [configure forced awareness]({{site.url}}{{site.baseurl}}/opensearch/cluster#forced-awareness) for the awareness attribute either before cluster start or after cluster start but before any indexing requests. If you enable replica enforcement after the cluster receives indexing requests, the unassigned shard information may be inaccurate. If you don't configure replica count enforcement and forced awareness, the `unassigned_shards` field will contain `-1`.
+{: .warning}
+
+## Required permissions
+
+If you use the Security plugin, make sure you have the appropriate permissions:
+`cluster:monitor/health`.
diff --git a/_api-reference/cluster-api/cluster-settings.md b/_api-reference/cluster-api/cluster-settings.md
new file mode 100644
index 00000000..35383390
--- /dev/null
+++ b/_api-reference/cluster-api/cluster-settings.md
@@ -0,0 +1,91 @@
+---
+layout: default
+title: Cluster settings
+nav_order: 50
+parent: Cluster APIs
+redirect_from:
+  - /api-reference/cluster-settings/
+  - /opensearch/rest-api/cluster-settings/
+---
+
+# Cluster settings
+**Introduced 1.0**
+{: .label .label-purple }
+
+The cluster settings operation lets you check the current settings for your cluster, review default settings, and change settings. When you update a setting using the API, OpenSearch applies it to all nodes in the cluster.
+
+## Path and HTTP methods
+
+```
+GET _cluster/settings
+PUT _cluster/settings
+```
+
+## Query parameters
+
+All query parameters are optional.
+
+Parameter | Data type | Description
+:--- | :--- | :---
+flat_settings | Boolean | Whether to return settings in the flat form, which can improve readability, especially for heavily nested settings. For example, the flat form of `"cluster": { "max_shards_per_node": 500 }` is `"cluster.max_shards_per_node": "500"`.
+include_defaults (GET only) | Boolean | Whether to include default settings as part of the response. This parameter is useful for identifying the names and current values of settings you want to update.
+cluster_manager_timeout | Time unit | The amount of time to wait for a response from the cluster manager node. Default is `30 seconds`.
+timeout (PUT only) | Time unit | The amount of time to wait for a response from the cluster. Default is `30 seconds`.
+
+
+#### Example request
+
+```json
+GET _cluster/settings?include_defaults=true
+```
+{% include copy-curl.html %}
+
+#### Example response
+
+The response contains any `persistent` and `transient` settings that have been explicitly set. Because the preceding request sets `include_defaults=true`, the response also includes a `defaults` object that lists every default setting; it is omitted from the following abbreviated example:
+
+```json
+{
+  "persistent": {
+    "action": {
+      "auto_create_index": "false"
+    }
+  },
+  "transient": {}
+}
+```
+
+## Request fields
+
+The GET operation has no request body options. All cluster setting field parameters are optional.
+
+Not all cluster settings can be updated using the cluster settings API. You will receive the error message `"setting [cluster.some.setting], not dynamically updateable"` when trying to configure these settings through the API.
+{: .note }
+
+For a listing of all cluster settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/).
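+
+To review current values before changing them, you can combine the query parameters described previously. For example, the following request (an illustrative convenience, not a required step) returns all settings, including defaults, in flat form:
+
+```json
+GET _cluster/settings?include_defaults=true&flat_settings=true
+```
+{% include copy-curl.html %}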
+
+#### Example request
+
+For a PUT operation, the request body must contain `transient` or `persistent`, along with the setting you want to update:
+
+```json
+PUT _cluster/settings
+{
+   "persistent":{
+      "cluster.max_shards_per_node": 500
+   }
+}
+```
+{% include copy-curl.html %}
+
+For more information about transient settings, persistent settings, and precedence, see [OpenSearch configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/).
+
+#### Example response
+
+```json
+{
+   "acknowledged":true,
+   "persistent":{
+     "cluster":{
+       "max_shards_per_node":"500"
+     }
+   },
+   "transient":{}
+}
+```
diff --git a/_api-reference/cluster-api/cluster-stats.md b/_api-reference/cluster-api/cluster-stats.md
new file mode 100644
index 00000000..8f8b585a
--- /dev/null
+++ b/_api-reference/cluster-api/cluster-stats.md
@@ -0,0 +1,519 @@
+---
+layout: default
+title: Cluster stats
+nav_order: 60
+parent: Cluster APIs
+has_children: false
+redirect_from:
+  - /api-reference/cluster-stats/
+  - /opensearch/rest-api/cluster-stats/
+---
+
+# Cluster stats
+**Introduced 1.0**
+{: .label .label-purple }
+
+The cluster stats API operation returns statistics about your cluster.
+
+## Example
+
+```json
+GET _cluster/stats/nodes/_cluster_manager
+```
+{% include copy-curl.html %}
+
+## Path and HTTP methods
+
+```json
+GET _cluster/stats
+GET _cluster/stats/nodes/<node-filters>
+```
+
+## URL parameters
+
+All cluster stats parameters are optional.
+
+Parameter | Type | Description
+:--- | :--- | :---
+<node-filters> | List | A comma-separated list of [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters) that OpenSearch uses to filter results.
+
+
+ Although the `master` node is now called `cluster_manager` in version 2.0, we retained the `master` field for backwards compatibility. If you have a node that has either a `master` role or a `cluster_manager` role, the `count` increases for both fields by 1. To see an example node count increase, see the example response below.
+ {: .note } + +## Response + +```json +{ + "_nodes": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "cluster_name": "opensearch-cluster", + "cluster_uuid": "QravFieJS_SlZJyBMcDMqQ", + "timestamp": 1644607845054, + "status": "yellow", + "indices": { + "count": 114, + "shards": { + "total": 121, + "primaries": 60, + "replication": 1.0166666666666666, + "index": { + "shards": { + "min": 1, + "max": 2, + "avg": 1.0614035087719298 + }, + "primaries": { + "min": 0, + "max": 2, + "avg": 0.5263157894736842 + }, + "replication": { + "min": 0.0, + "max": 1.0, + "avg": 0.008771929824561403 + } + } + }, + "docs": { + "count": 134263, + "deleted": 115 + }, + "store": { + "size_in_bytes": 70466547, + "reserved_in_bytes": 0 + }, + "fielddata": { + "memory_size_in_bytes": 664, + "evictions": 0 + }, + "query_cache": { + "memory_size_in_bytes": 0, + "total_count": 1, + "hit_count": 0, + "miss_count": 1, + "cache_size": 0, + "cache_count": 0, + "evictions": 0 + }, + "completion": { + "size_in_bytes": 0 + }, + "segments": { + "count": 341, + "memory_in_bytes": 3137244, + "terms_memory_in_bytes": 2488992, + "stored_fields_memory_in_bytes": 167672, + "term_vectors_memory_in_bytes": 0, + "norms_memory_in_bytes": 346816, + "points_memory_in_bytes": 0, + "doc_values_memory_in_bytes": 133764, + "index_writer_memory_in_bytes": 0, + "version_map_memory_in_bytes": 0, + "fixed_bit_set_memory_in_bytes": 1112, + "max_unsafe_auto_id_timestamp": 1644269449096, + "remote_store" : { + "upload" : { + "total_upload_size" : { + "started_bytes" : 152419, + "succeeded_bytes" : 152419, + "failed_bytes" : 0 + }, + "refresh_size_lag" : { + "total_bytes" : 0, + "max_bytes" : 0 + }, + "max_refresh_time_lag_in_millis" : 0, + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } + }, + "download" : { + "total_download_size" : { + "started_bytes" : 0, + "succeeded_bytes" : 0, + "failed_bytes" : 0 + }, + "total_time_spent_in_millis" : 0 + } + }, + "file_sizes": {} + }, + "mappings": { + "field_types": [ + { + "name": "alias", + "count": 1, + "index_count": 1 + }, + { + "name": "binary", + "count": 1, + "index_count": 1 + }, + { + "name": "boolean", + "count": 87, + "index_count": 22 + }, + { + "name": "date", + "count": 185, + "index_count": 91 + }, + { + "name": "double", + "count": 5, + "index_count": 2 + }, + { + "name": "float", + "count": 4, + "index_count": 1 + }, + { + "name": "geo_point", + "count": 4, + "index_count": 3 + }, + { + "name": "half_float", + "count": 12, + "index_count": 1 + }, + { + "name": "integer", + "count": 144, + "index_count": 29 + }, + { + "name": "ip", + "count": 2, + "index_count": 1 + }, + { + "name": "keyword", + "count": 1939, + "index_count": 109 + }, + { + "name": "knn_vector", + "count": 1, + "index_count": 1 + }, + { + "name": "long", + "count": 158, + "index_count": 92 + }, + { + "name": "nested", + "count": 25, + "index_count": 10 + }, + { + "name": "object", + "count": 420, + "index_count": 91 + }, + { + "name": "text", + "count": 1768, + "index_count": 102 + } + ] + }, + "analysis": { + "char_filter_types": [], + "tokenizer_types": [], + "filter_types": [], + "analyzer_types": [], + "built_in_char_filters": [], + "built_in_tokenizers": [], + "built_in_filters": [], + "built_in_analyzers": [ + { + "name": "english", + "count": 1, + "index_count": 1 + } + ] + } + }, + "nodes": { + "count": { + "total": 1, + "coordinating_only": 0, + "data": 1, + "ingest": 1, + "master": 1, + "cluster_manager": 1, + "remote_cluster_client": 1 + }, + "versions": [ + "1.2.4" + ], 
+ "os": { + "available_processors": 6, + "allocated_processors": 6, + "names": [ + { + "name": "Linux", + "count": 1 + } + ], + "pretty_names": [ + { + "pretty_name": "Amazon Linux 2", + "count": 1 + } + ], + "mem": { + "total_in_bytes": 6232674304, + "free_in_bytes": 1452658688, + "used_in_bytes": 4780015616, + "free_percent": 23, + "used_percent": 77 + } + }, + "process": { + "cpu": { + "percent": 0 + }, + "open_file_descriptors": { + "min": 970, + "max": 970, + "avg": 970 + } + }, + "jvm": { + "max_uptime_in_millis": 108800629, + "versions": [ + { + "version": "15.0.1", + "vm_name": "OpenJDK 64-Bit Server VM", + "vm_version": "15.0.1+9", + "vm_vendor": "AdoptOpenJDK", + "bundled_jdk": true, + "using_bundled_jdk": true, + "count": 1 + } + ], + "mem": { + "heap_used_in_bytes": 178956256, + "heap_max_in_bytes": 536870912 + }, + "threads": 112 + }, + "fs": { + "total_in_bytes": 62725623808, + "free_in_bytes": 28442726400, + "available_in_bytes": 25226010624 + }, + "plugins": [ + { + "name": "opensearch-index-management", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "OpenSearch Index Management Plugin", + "classname": "org.opensearch.indexmanagement.IndexManagementPlugin", + "custom_foldername": "", + "extended_plugins": [ + "opensearch-job-scheduler" + ], + "has_native_controller": false + }, + { + "name": "opensearch-security", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "Provide access control related features for OpenSearch 1.0.0", + "classname": "org.opensearch.security.OpenSearchSecurityPlugin", + "custom_foldername": "opensearch-security", + "extended_plugins": [], + "has_native_controller": false + }, + { + "name": "opensearch-cross-cluster-replication", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "OpenSearch Cross Cluster Replication Plugin", + "classname": "org.opensearch.replication.ReplicationPlugin", + "custom_foldername": "", + "extended_plugins": [], + "has_native_controller": false + }, + { + "name": "opensearch-job-scheduler", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "OpenSearch Job Scheduler plugin", + "classname": "org.opensearch.jobscheduler.JobSchedulerPlugin", + "custom_foldername": "", + "extended_plugins": [], + "has_native_controller": false + }, + { + "name": "opensearch-anomaly-detection", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "OpenSearch anomaly detector plugin", + "classname": "org.opensearch.ad.AnomalyDetectorPlugin", + "custom_foldername": "", + "extended_plugins": [ + "lang-painless", + "opensearch-job-scheduler" + ], + "has_native_controller": false + }, + { + "name": "opensearch-performance-analyzer", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "OpenSearch Performance Analyzer Plugin", + "classname": "org.opensearch.performanceanalyzer.PerformanceAnalyzerPlugin", + "custom_foldername": "", + "extended_plugins": [], + "has_native_controller": false + }, + { + "name": "opensearch-reports-scheduler", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "Scheduler for Dashboards Reports Plugin", + "classname": "org.opensearch.reportsscheduler.ReportsSchedulerPlugin", + "custom_foldername": "", + "extended_plugins": [ + "opensearch-job-scheduler" + ], + "has_native_controller": false + 
}, + { + "name": "opensearch-asynchronous-search", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "Provides support for asynchronous search", + "classname": "org.opensearch.search.asynchronous.plugin.AsynchronousSearchPlugin", + "custom_foldername": "", + "extended_plugins": [], + "has_native_controller": false + }, + { + "name": "opensearch-knn", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "OpenSearch k-NN plugin", + "classname": "org.opensearch.knn.plugin.KNNPlugin", + "custom_foldername": "", + "extended_plugins": [ + "lang-painless" + ], + "has_native_controller": false + }, + { + "name": "opensearch-alerting", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "Amazon OpenSearch alerting plugin", + "classname": "org.opensearch.alerting.AlertingPlugin", + "custom_foldername": "", + "extended_plugins": [ + "lang-painless" + ], + "has_native_controller": false + }, + { + "name": "opensearch-observability", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "OpenSearch Plugin for OpenSearch Dashboards Observability", + "classname": "org.opensearch.observability.ObservabilityPlugin", + "custom_foldername": "", + "extended_plugins": [], + "has_native_controller": false + }, + { + "name": "opensearch-sql", + "version": "1.2.4.0", + "opensearch_version": "1.2.4", + "java_version": "1.8", + "description": "OpenSearch SQL", + "classname": "org.opensearch.sql.plugin.SQLPlugin", + "custom_foldername": "", + "extended_plugins": [], + "has_native_controller": false + } + ], + "network_types": { + "transport_types": { + "org.opensearch.security.ssl.http.netty.SecuritySSLNettyTransport": 1 + }, + "http_types": { + "org.opensearch.security.http.SecurityHttpServerTransport": 1 + } + }, + "discovery_types": { + "zen": 1 + }, + "packaging_types": [ + { + "type": "tar", + "count": 1 + } + ], + "ingest": { + "number_of_pipelines": 0, + "processor_stats": {} + } + } +} +``` + +## Response body fields + +Field | Description +:--- | :--- +nodes | How many nodes returned in the response. +cluster_name | The cluster's name. +cluster_uuid | The cluster's uuid. +timestamp | The Unix epoch time of when the cluster was last refreshed. +status | The cluster's health status. +indices | Statistics about the indexes in the cluster. +indices.count | How many indexes are in the cluster. +indices.shards | Information about the cluster's shards. +indices.docs | How many documents are still in the cluster and how many documents are deleted. +indices.store | Information about the cluster's storage. +indices.fielddata | Information about the cluster's field data +indices.query_cache | Data about the cluster's query cache. +indices.completion | How many bytes in memory are used to complete operations. +indices.segments | Information about the cluster's segments, which are small Lucene indexes. +indices.mappings | Mappings within the cluster. +indices.analysis | Information about analyzers used in the cluster. +nodes | Statistics about the nodes in the cluster. +nodes.count | How many nodes were returned from the request. +nodes.versions | OpenSearch's version number. +nodes.os | Information about the operating systems used in the nodes. +nodes.process | The processes the returned nodes use. +nodes.jvm | Statistics about the Java Virtual Machines in use. +nodes.fs | The nodes' file storage. 
+nodes.plugins | The OpenSearch plugins integrated within the nodes. +nodes.network_types | The transport and HTTP networks within the nodes. +nodes.discovery_type | The method the nodes use to find other nodes within the cluster. +nodes.packaging_types | Information about the nodes' OpenSearch distribution. +nodes.ingest | Information about the nodes' ingest pipelines/nodes, if there are any. +total_time_spent | The total amount of download and upload time spent across all shards in the cluster when downloading or uploading from the remote store. diff --git a/_api-reference/cluster-api/index.md b/_api-reference/cluster-api/index.md new file mode 100644 index 00000000..4432c114 --- /dev/null +++ b/_api-reference/cluster-api/index.md @@ -0,0 +1,14 @@ +--- +layout: default +title: Cluster APIs +has_children: true +nav_order: 15 +redirect_from: + - /opensearch/api-reference/cluster-api/ +--- + +# Cluster APIs +**Introduced 1.0** +{: .label .label-purple } + +The cluster APIs allow you to manage your cluster. You can use them to check cluster health, modify settings, retrieve statistics, and more. diff --git a/_api-reference/common-parameters.md b/_api-reference/common-parameters.md new file mode 100644 index 00000000..347d38a0 --- /dev/null +++ b/_api-reference/common-parameters.md @@ -0,0 +1,92 @@ +--- +layout: default +title: Common REST Parameters +nav_order: 93 +redirect_from: + - /opensearch/common-parameters/ +--- + +# Common REST parameters +**Introduced 1.0** +{: .label .label-purple } + +OpenSearch supports the following parameters for all REST operations: + +## Human-readable output + +To convert output units to human-readable values (for example, `1h` for 1 hour and `1kb` for 1,024 bytes), add `?human=true` to the request URL. + +#### Example request + +The following request requires response values to be in human-readable format: + +```json + +GET /_search?human=true +``` + +## Pretty result + +To get back JSON responses in a readable format, add `?pretty=true` to the request URL. + +#### Example request + +The following request requires the response to be displayed in pretty JSON format: + +```json + +GET /_search?pretty=true +``` + +## Content type + +To specify the type of content in the request body, use the `Content-Type` key name in the request header. Most operations support JSON, YAML, and CBOR formats. + +#### Example request + +The following request specifies JSON format for the request body: + +```json + +curl -H "Content-type: application/json" -XGET localhost:9200/_scripts/ +``` + +## Request body in query string + +If the client library does not accept a request body for non-POST requests, use the `source` query string parameter to pass the request body. Also, specify the `source_content_type` parameter with a supported media type such as `application/json`. + + +#### Example request + +The following request searches the documents in the `shakespeare` index for a specific field and value: + +```json + +GET shakespeare/search?source={"query":{"exists":{"field":"speaker"}}}&source_content_type=application/json +``` + +## Stack traces + +To include the error stack trace in the response when an exception is raised, add `error_trace=true` to the request URL. + +#### Example request + +The following request sets `error_trace` to `true` so that the response returns exception-triggered errors: + +```json + +GET /_search?error_trace=true +``` + +## Filtered responses + +To reduce the response size use the `filter_path` parameter to filter the fields that are returned. 
This parameter takes a comma-separated list of filters. It supports using wildcards to match any field or part of a field's name. You can also exclude fields with `-`. + +#### Example request + +The following request specifies filters to limit the fields returned in the response: + +```json + +GET _search?filter_path=.*,- +``` diff --git a/_opensearch/rest-api/count.md b/_api-reference/count.md similarity index 80% rename from _opensearch/rest-api/count.md rename to _api-reference/count.md index c3463a08..3e777a41 100644 --- a/_opensearch/rest-api/count.md +++ b/_api-reference/count.md @@ -1,18 +1,18 @@ --- layout: default title: Count -parent: REST API reference -nav_order: 150 +nav_order: 21 +redirect_from: + - /opensearch/rest-api/count/ --- # Count -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } The count API gives you quick access to the number of documents that match a query. You can also use it to check the document count of an index, data stream, or cluster. - ## Example To see the number of documents that match a query: @@ -27,6 +27,7 @@ GET opensearch_dashboards_sample_data_logs/_count } } ``` +{% include copy-curl.html %} The following call to the search API produces equivalent results: @@ -42,12 +43,14 @@ GET opensearch_dashboards_sample_data_logs/_search "track_total_hits": true } ``` +{% include copy-curl.html %} To see the number of documents in an index: ```json GET opensearch_dashboards_sample_data_logs/_count ``` +{% include copy-curl.html %} To check for the number of documents in a [data stream]({{site.url}}{{site.baseurl}}/opensearch/data-streams/), replace the index name with the data stream name. @@ -56,8 +59,9 @@ To see the number of documents in your cluster: ```json GET _count ``` +{% include copy-curl.html %} -Alternatively, you could use the [cat indices]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/cat-indices/) and [cat count]({{site.url}}{{site.baseurl}}/opensearch/rest-api/cat/cat-count/) APIs to see the number of documents per index or data stream. +Alternatively, you could use the [cat indexes]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-indices/) and [cat count]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-count/) APIs to see the number of documents per index or data stream. {: .note } @@ -75,13 +79,13 @@ All count parameters are optional. Parameter | Type | Description :--- | :--- | :--- -`allow_no_indices` | Boolean | If false, the request returns an error if any wildcard expression or index alias targets any closed or missing indices. Default is false. +`allow_no_indices` | Boolean | If false, the request returns an error if any wildcard expression or index alias targets any closed or missing indexes. Default is false. `analyzer` | String | The analyzer to use in the query string. `analyze_wildcard` | Boolean | Specifies whether to analyze wildcard and prefix queries. Default is false. `default_operator` | String | Indicates whether the default operator for a string query should be AND or OR. Default is OR. `df` | String | The default field in case a field prefix is not provided in the query string. -`expand_wildcards` | String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values. Valid values are `all` (match any index), `open` (match open, non-hidden indices), `closed` (match closed, non-hidden indices), `hidden` (match hidden indices), and `none` (deny wildcard expressions). Default is `open`. 
-`ignore_unavailable` | Boolean | Specifies whether to include missing or closed indices in the response. Default is false. +`expand_wildcards` | String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values. Valid values are `all` (match any index), `open` (match open, non-hidden indexes), `closed` (match closed, non-hidden indexes), `hidden` (match hidden indexes), and `none` (deny wildcard expressions). Default is `open`. +`ignore_unavailable` | Boolean | Specifies whether to include missing or closed indexes in the response. Default is false. `lenient` | Boolean | Specifies whether OpenSearch should accept requests if queries have format errors (for example, querying a text field for an integer). Default is false. `min_score` | Float | Include only documents with a minimum `_score` value in the result. `routing` | String | Value used to route the operation to a specific shard. diff --git a/_opensearch/rest-api/document-apis/bulk.md b/_api-reference/document-apis/bulk.md similarity index 74% rename from _opensearch/rest-api/document-apis/bulk.md rename to _api-reference/document-apis/bulk.md index c10a3932..d0023081 100644 --- a/_opensearch/rest-api/document-apis/bulk.md +++ b/_api-reference/document-apis/bulk.md @@ -2,17 +2,21 @@ layout: default title: Bulk parent: Document APIs -grand_parent: REST API reference nav_order: 20 +redirect_from: + - /opensearch/rest-api/document-apis/bulk/ --- # Bulk -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } -The bulk operation lets you add, update, or delete many documents in a single request. Compared to individual OpenSearch indexing requests, the bulk operation has significant performance benefits. Whenever practical, we recommend batching indexing operations into bulk requests. +The bulk operation lets you add, update, or delete multiple documents in a single request. Compared to individual OpenSearch indexing requests, the bulk operation has significant performance benefits. Whenever practical, we recommend batching indexing operations into bulk requests. +Beginning in OpenSearch 2.9, when indexing documents using the bulk operation, the document `_id` must be 512 bytes or less in size. +{: .note} + ## Example ```json @@ -26,6 +30,7 @@ POST _bulk { "doc" : { "title": "World War Z" } } ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -35,7 +40,7 @@ POST _bulk POST /_bulk ``` -Specifying the index in the path means you don't need to include it in the [request body]({{site.url}}{{site.baseurl}}/opensearch/rest-api/document-apis/bulk/#request-body). +Specifying the index in the path means you don't need to include it in the [request body]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/#request-body). OpenSearch also accepts PUT requests to the `_bulk` path, but we highly recommend using POST. The accepted usage of PUT---adding or replacing a single resource at a given path---doesn't make sense for bulk requests. {: .note } @@ -52,7 +57,7 @@ refresh | Enum | Whether to refresh the affected shards after performing the ind require_alias | Boolean | Set to `true` to require that all actions target an index alias rather than an index. Default is `false`. routing | String | Routes the request to the specified shard. timeout | Time | How long to wait for the request to return. Default `1m`. -type | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. 
We highly recommend ignoring this parameter and using a type of `_doc` for all indices. +type | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. We highly recommend ignoring this parameter and using a type of `_doc` for all indexes. wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the bulk request. Default is 1 (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the request to succeed. {% comment %}_source | List | asdf _source_excludes | list | asdf @@ -77,7 +82,7 @@ All actions support the same metadata: `_index`, `_id`, and `_require_alias`. If - Create - Creates a document if it doesn't already exist and returns an error otherwise. The next line must include a JSON document. + Creates a document if it doesn't already exist and returns an error otherwise. The next line must include a JSON document: ```json { "create": { "_index": "movies", "_id": "tt1392214" } } @@ -86,7 +91,7 @@ All actions support the same metadata: `_index`, `_id`, and `_require_alias`. If - Delete - This action deletes a document if it exists. If the document doesn't exist, OpenSearch doesn't return an error, but instead returns `not_found` under `result`. Delete actions don't require documents on the next line. + This action deletes a document if it exists. If the document doesn't exist, OpenSearch doesn't return an error but instead returns `not_found` under `result`. Delete actions don't require documents on the next line: ```json { "delete": { "_index": "movies", "_id": "tt2229499" } } @@ -94,7 +99,7 @@ All actions support the same metadata: `_index`, `_id`, and `_require_alias`. If - Index - Index actions create a document if it doesn't yet exist and replace the document if it already exists. The next line must include a JSON document. + Index actions create a document if it doesn't yet exist and replace the document if it already exists. The next line must include a JSON document: ```json { "index": { "_index": "movies", "_id": "tt1979320" } } @@ -103,13 +108,28 @@ All actions support the same metadata: `_index`, `_id`, and `_require_alias`. If - Update - This action updates existing documents and returns an error if the document doesn't exist. The next line must include a full or partial JSON document, depending on how much of the document you want to update. It can also include a script or upsert for more complex document updates. + By default, this action updates existing documents and returns an error if the document doesn't exist. The next line must include a full or partial JSON document, depending on how much of the document you want to update: ```json { "update": { "_index": "movies", "_id": "tt0816711" } } { "doc" : { "title": "World War Z" } } ``` + To upsert a document, specify `doc_as_upsert` as `true`. 
If a document exists, it is updated; if it does not exist, a new document is indexed with the parameters specified in the `doc` field: + + - Upsert + ```json + { "update": { "_index": "movies", "_id": "tt0816711" } } + { "doc" : { "title": "World War Z" }, "doc_as_upsert": true } + ``` + + You can specify a script for more complex document updates: + + - Script + ```json + { "update": { "_index": "movies", "_id": "tt0816711" } } + { "script" : { "source": "ctx._source.title = \"World War Z\"" } } + ``` ## Response @@ -123,7 +143,6 @@ In the response, pay particular attention to the top-level `errors` boolean. If { "index": { "_index": "movies", - "_type": "_doc", "_id": "tt1979320", "_version": 1, "result": "created", @@ -140,7 +159,6 @@ In the response, pay particular attention to the top-level `errors` boolean. If { "create": { "_index": "movies", - "_type": "_doc", "_id": "tt1392214", "status": 409, "error": { @@ -155,7 +173,6 @@ In the response, pay particular attention to the top-level `errors` boolean. If { "update": { "_index": "movies", - "_type": "_doc", "_id": "tt0816711", "status": 404, "error": { diff --git a/_opensearch/rest-api/document-apis/delete-by-query.md b/_api-reference/document-apis/delete-by-query.md similarity index 83% rename from _opensearch/rest-api/document-apis/delete-by-query.md rename to _api-reference/document-apis/delete-by-query.md index e858d1d1..b205ed76 100644 --- a/_opensearch/rest-api/document-apis/delete-by-query.md +++ b/_api-reference/document-apis/delete-by-query.md @@ -2,12 +2,13 @@ layout: default title: Delete by query parent: Document APIs -grand_parent: REST API reference nav_order: 40 +redirect_from: + - /opensearch/rest-api/document-apis/delete-by-query/ --- # Delete by query -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple} You can include a query as part of your delete request so OpenSearch deletes all documents that match that query. @@ -24,6 +25,7 @@ POST sample-index1/_delete_by_query } } ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -37,16 +39,16 @@ All URL parameters are optional. Parameter | Type | Description :--- | :--- | :--- | :--- -<index> | String | Name or list of the data streams, indices, or aliases to delete from. Supports wildcards. If left blank, OpenSearch searches all indices. -allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indices. Default is `true`. +<index> | String | Name or list of the data streams, indexes, or aliases to delete from. Supports wildcards. If left blank, OpenSearch searches all indexes. +allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. analyzer | String | The analyzer to use in the query string. analyze_wildcard | Boolean | Specifies whether to analyze wildcard and prefix queries. Default is false. conflicts | String | Indicates to OpenSearch what should happen if the delete by query operation runs into a version conflict. Valid options are `abort` and `proceed`. Default is `abort`. default_operator | String | Indicates whether the default operator for a string query should be AND or OR. Default is OR. df | String | The default field in case a field prefix is not provided in the query string. -expand_wildcards | String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values. 
Valid values are `all` (match any index), `open` (match open, non-hidden indices), `closed` (match closed, non-hidden indices), `hidden` (match hidden indices), and `none` (deny wildcard expressions). Default is `open`. +expand_wildcards | String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values. Valid values are `all` (match any index), `open` (match open, non-hidden indexes), `closed` (match closed, non-hidden indexes), `hidden` (match hidden indexes), and `none` (deny wildcard expressions). Default is `open`. from | Integer | The starting index to search from. Default is 0. -ignore_unavailable | Boolean | Specifies whether to include missing or closed indices in the response. Default is false. +ignore_unavailable | Boolean | Specifies whether to include missing or closed indexes in the response. Default is false. lenient | Boolean | Specifies whether OpenSearch should accept requests if queries have format errors (for example, querying a text field for an integer). Default is false. max_docs | Integer | How many documents the delete by query operation should process at most. Default is all documents. preference | String | Specifies which shard or node OpenSearch should perform the delete by query operation on. @@ -59,6 +61,7 @@ scroll | Time | Amount of time the search context should be open. scroll_size | Integer | Size of the operation's scroll requests. Default is 1000. search_type | String | Whether OpenSearch should use global term and document frequencies calculating revelance scores. Valid choices are `query_then_fetch` and `dfs_query_then_fetch`. `query_then_fetch` scores documents using local term and document frequencies for the shard. It’s usually faster but less accurate. `dfs_query_then_fetch` scores documents using global term and document frequencies across all shards. It’s usually slower but more accurate. Default is `query_then_fetch`. search_timeout | Time | How long to wait until OpenSearch deems the request timed out. Default is no timeout. +slices | String or Integer | How many slices to cut the operation into for faster processing. Specify an integer to set how many slices to divide the operation into, or use `auto`, which tells OpenSearch it should decide how many slices to divide into. If you have a lot of shards in your index, set a lower number for better efficiency. Default is 1, which means the task should not be divided. sort | String | A comma-separated list of <field> : <direction> pairs to sort by. _source | String | Specifies whether to include the `_source` field in the response. _source_excludes | String | A comma-separated list of source fields to exclude from the response. @@ -68,10 +71,12 @@ terminate_after | Integer | The maximum number of documents OpenSearch should pr timeout | Time | How long the operation should wait from a response from active shards. Default is `1m`. version | Boolean | Whether to include the document version as a match. wait_for_active_shards | String | The number of shards that must be active before OpenSearch executes the operation. Valid values are `all` or any integer up to the total number of shards in the index. Default is 1, which is the primary shard. +wait_for_completion | Boolean | Setting this parameter to false indicates to OpenSearch it should not wait for completion and perform this request asynchronously. Asynchronous requests run in the background, and you can use the [Tasks]({{site.url}}{{site.baseurl}}/api-reference/tasks) API to monitor progress. 
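+
+For example, the following request (a sketch that reuses the query from the example above) lets OpenSearch choose the number of slices and runs the operation asynchronously, returning a task ID instead of waiting for the deletions to finish:
+
+```json
+POST sample-index1/_delete_by_query?slices=auto&wait_for_completion=false
+{
+  "query": {
+    "match": {
+      "movie-length": "124"
+    }
+  }
+}
+```
+{% include copy-curl.html %}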
+ ## Request body -To search your index for specific documents, you must include a [query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index) in the request body that OpenSearch uses to match documents. If you don't use a query, OpenSearch treats your delete request as a simple [delete document operation]({{site.url}}{{site.baseurl}}/opensearch/rest-api/document-apis/delete-document). +To search your index for specific documents, you must include a [query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index) in the request body that OpenSearch uses to match documents. If you don't use a query, OpenSearch treats your delete request as a simple [delete document operation]({{site.url}}{{site.baseurl}}/api-reference/document-apis/delete-document). ```json { diff --git a/_opensearch/rest-api/document-apis/delete-document.md b/_api-reference/document-apis/delete-document.md similarity index 92% rename from _opensearch/rest-api/document-apis/delete-document.md rename to _api-reference/document-apis/delete-document.md index a1dd9254..c3dea2f7 100644 --- a/_opensearch/rest-api/document-apis/delete-document.md +++ b/_api-reference/document-apis/delete-document.md @@ -2,12 +2,13 @@ layout: default title: Delete document parent: Document APIs -grand_parent: REST API reference nav_order: 15 +redirect_from: + - /opensearch/rest-api/document-apis/delete-document/ --- # Delete document -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } If you no longer need a document in your index, you can use the delete document API operation to delete it. @@ -17,6 +18,7 @@ If you no longer need a document in your index, you can use the delete document ``` DELETE /sample-index1/_doc/1 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -44,7 +46,6 @@ wait_for_active_shards | String | The number of active shards that must be avail ```json { "_index": "sample-index1", - "_type": "_doc", "_id": "1", "_version": 2, "result": "deleted", @@ -63,13 +64,12 @@ wait_for_active_shards | String | The number of active shards that must be avail Field | Description :--- | :--- _index | The name of the index. -_type | The document's type. OpenSearch only supports one type, which is `_doc`. _id | The document's ID. _version | The document's version. _result | The result of the delete operation. _shards | Detailed information about the cluster's shards. total | The total number of shards. -successful | The number of shards OpenSearch succssfully deleted the document from. +successful | The number of shards OpenSearch successfully deleted the document from. failed | The number of shards OpenSearch failed to delete the document from. _seq_no | The sequence number assigned when the document was indexed. _primary_term | The primary term assigned when the document was indexed. 
diff --git a/_opensearch/rest-api/document-apis/get-documents.md b/_api-reference/document-apis/get-documents.md similarity index 93% rename from _opensearch/rest-api/document-apis/get-documents.md rename to _api-reference/document-apis/get-documents.md index d5aad0d1..d5c2e52d 100644 --- a/_opensearch/rest-api/document-apis/get-documents.md +++ b/_api-reference/document-apis/get-documents.md @@ -2,12 +2,13 @@ layout: default title: Get document parent: Document APIs -grand_parent: REST API reference nav_order: 5 +redirect_from: + - /opensearch/rest-api/document-apis/get-documents/ --- # Get document -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } After adding a JSON document to your index, you can use the get document API operation to retrieve the document's information and data. @@ -17,6 +18,7 @@ After adding a JSON document to your index, you can use the get document API ope ```json GET sample-index1/_doc/1 ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -51,7 +53,6 @@ version_type | Enum | Retrieves a specifically typed document. Available options ```json { "_index": "sample-index1", - "_type": "_doc", "_id": "1", "_version": 1, "_seq_no": 0, @@ -68,10 +69,9 @@ version_type | Enum | Retrieves a specifically typed document. Available options Field | Description :--- | :--- _index | The name of the index. -_type | The document's type. OpenSearch only supports one type, which is `_doc`. _id | The document's ID. _version | The document's version number. Updated whenever the document changes. -_seq_no | The sequnce number assigned when the document is indexed. +_seq_no | The sequence number assigned when the document is indexed. primary_term | The primary term assigned when the document is indexed. found | Whether the document exists. _routing | The shard that the document is routed to. If the document is not routed to a particular shard, this field is omitted. diff --git a/_opensearch/rest-api/document-apis/index-document.md b/_api-reference/document-apis/index-document.md similarity index 81% rename from _opensearch/rest-api/document-apis/index-document.md rename to _api-reference/document-apis/index-document.md index d83ba319..ec1664bc 100644 --- a/_opensearch/rest-api/document-apis/index-document.md +++ b/_api-reference/document-apis/index-document.md @@ -2,12 +2,13 @@ layout: default title: Index document parent: Document APIs -grand_parent: REST API reference nav_order: 1 +redirect_from: + - /opensearch/rest-api/document-apis/index-document/ --- # Index document -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple} Before you can search for data, you must first add documents. This operation adds a single document to your index. @@ -20,6 +21,7 @@ PUT sample-index/_doc/1 "Description": "To be or not to be, that is the question." } ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -41,9 +43,10 @@ Parameter | Type | Description | Required <_id> | String | A unique identifier to attach to the document. To automatically generate an ID, use `POST /doc` in your request instead of PUT. | No if_seq_no | Integer | Only perform the index operation if the document has the specified sequence number. | No if_primary_term | Integer | Only perform the index operation if the document has the specified primary term.| No -op_type | Enum | Specifies the type of operation to complete with the document. Valid values are `create` (create the index if it doesn't exist) and `index`. If a document ID is included in the request, then the default is `index`. 
Otherwise, the default is `create`. | No +op_type | Enum | Specifies the type of operation to complete with the document. Valid values are `create` (index a document only if it doesn't exist) and `index`. If a document ID is included in the request, then the default is `index`. Otherwise, the default is `create`. | No pipeline | String | Route the index operation to a certain pipeline. | No routing | String | value used to assign the index operation to a specific shard. | No +refresh | Enum | If true, OpenSearch refreshes shards to make the operation visible to searching. Valid options are `true`, `false`, and `wait_for`, which tells OpenSearch to wait for a refresh before executing the operation. Default is false. | No timeout | Time | How long to wait for a response from the cluster. Default is `1m`. | No version | Integer | The document's version number. | No version_type | Enum | Assigns a specific type to the document. Valid options are `external` (retrieve the document if the specified version number is greater than the document's current version) and `external_gte` (retrieve the document if the specified version number is greater than or equal to the document's current version). For example, to index version 3 of a document, use `/_doc/1?version=3&version_type=external`. | No @@ -64,7 +67,6 @@ Your request body must contain the information you want to index. ```json { "_index": "sample-index", - "_type": "_doc", "_id": "1", "_version": 1, "result": "created", @@ -83,13 +85,12 @@ Your request body must contain the information you want to index. Field | Description :--- | :--- _index | The name of the index. -_type | The document's type. OpenSearch supports only one type, which is `_doc`. _id | The document's ID. _version | The document's version. result | The result of the index operation. _shards | Detailed information about the cluster's shards. total | The total number of shards. -successful | The number of shards OpenSearch succssfully added the document to. -failed | The number of shards OpenSearch failed to added the document to. +successful | The number of shards OpenSearch successfully added the document to. +failed | The number of shards OpenSearch failed to add the document to. _seq_no | The sequence number assigned when the document was indexed. _primary_term | The primary term assigned when the document was indexed. diff --git a/_opensearch/rest-api/document-apis/index.md b/_api-reference/document-apis/index.md similarity index 86% rename from _opensearch/rest-api/document-apis/index.md rename to _api-reference/document-apis/index.md index 7c8eda9a..9444efa5 100644 --- a/_opensearch/rest-api/document-apis/index.md +++ b/_api-reference/document-apis/index.md @@ -1,14 +1,15 @@ --- layout: default title: Document APIs -parent: REST API reference has_children: true -nav_order: 7 +nav_order: 25 redirect_from: - - /opensearch/rest-api/document-apis/ + - /opensearch/rest-api/document-apis/index/ --- # Document APIs +**Introduced 1.0** +{: .label .label-purple } The document APIs allow you to handle documents relative to your index, such as adding, updating, and deleting documents. 
diff --git a/_opensearch/rest-api/document-apis/multi-get.md b/_api-reference/document-apis/multi-get.md similarity index 88% rename from _opensearch/rest-api/document-apis/multi-get.md rename to _api-reference/document-apis/multi-get.md index d3d647af..16e9ceeb 100644 --- a/_opensearch/rest-api/document-apis/multi-get.md +++ b/_api-reference/document-apis/multi-get.md @@ -2,63 +2,24 @@ layout: default title: Multi-get document parent: Document APIs -grand_parent: REST API reference nav_order: 30 +redirect_from: + - /opensearch/rest-api/document-apis/multi-get/ --- # Multi-get documents -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } -The multi-get operation allows you to execute multiple GET operations in one request, so you can get back all documents that match your criteria. - -## Example without specifying index in URL - -```json -GET _mget -{ - "docs": [ - { - "_index": "sample-index1", - "_id": "1" - }, - { - "_index": "sample-index2", - "_id": "1", - "_source": { - "include": ["Length"] - } - } - ] -} -``` - -## Example of specifying index in URL - -```json -GET sample-index1/_mget - -{ - "docs": [ - { - "_type": "_doc", - "_id": "1", - "_source": false - }, - { - "_type": "_doc", - "_id": "2", - "_source": [ "Director", "Title" ] - } - ] -} -``` +The multi-get operation allows you to run multiple GET operations in one request, so you can get back all documents that match your criteria. ## Path and HTTP methods ``` GET _mget GET /_mget +POST _mget +POST /_mget ``` ## URL parameters @@ -79,7 +40,7 @@ _source_includes | String | A comma-separated list of source fields to include i ## Request body -If you don't specify an index in your request's URL, you must specify your target indices and the relevant document IDs in the request body. Other fields are optional. +If you don't specify an index in your request's URL, you must specify your target indexes and the relevant document IDs in the request body. Other fields are optional. Field | Type | Description | Required :--- | :--- | :--- | :--- @@ -92,13 +53,54 @@ _source.includes | Array | Specifies which fields to include in the query respon _source.excludes | Array | Specifies which fields to exclude in the query response. For example, `"_source": { "exclude": ["Director"] }` excludes `Director` from the query response. | No ids | Array | IDs of the documents to retrieve. Only allowed when an index is specified in the URL. | No -## Response + +#### Example without specifying index in URL + +```json +GET _mget +{ + "docs": [ + { + "_index": "sample-index1", + "_id": "1" + }, + { + "_index": "sample-index2", + "_id": "1", + "_source": { + "include": ["Length"] + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Example of specifying index in URL + +```json +GET sample-index1/_mget +{ + "docs": [ + { + "_id": "1", + "_source": false + }, + { + "_id": "2", + "_source": [ "Director", "Title" ] + } + ] +} +``` +{% include copy-curl.html %} + +#### Example Response ```json { "docs": [ { "_index": "sample-index1", - "_type": "_doc", "_id": "1", "_version": 4, "_seq_no": 5, @@ -111,7 +113,6 @@ ids | Array | IDs of the documents to retrieve. Only allowed when an index is sp }, { "_index": "sample-index2", - "_type": "_doc", "_id": "1", "_version": 1, "_seq_no": 6, @@ -131,10 +132,9 @@ ids | Array | IDs of the documents to retrieve. Only allowed when an index is sp Field | Description :--- | :--- _index | The name of the index. -_type | The document's type. OpenSearch only supports one type, which is `_doc`. 
_id | The document's ID. _version | The document's version number. Updated whenever the document changes. -_seq_no | The sequnce number assigned when the document is indexed. +_seq_no | The sequence number assigned when the document is indexed. primary_term | The primary term assigned when the document is indexed. found | Whether the document exists. _routing | The shard that the document is routed to. If the document is not routed to a particular shard, this field is omitted. diff --git a/_opensearch/rest-api/document-apis/reindex.md b/_api-reference/document-apis/reindex.md similarity index 89% rename from _opensearch/rest-api/document-apis/reindex.md rename to _api-reference/document-apis/reindex.md index 5ad961a8..766f5b28 100644 --- a/_opensearch/rest-api/document-apis/reindex.md +++ b/_api-reference/document-apis/reindex.md @@ -1,16 +1,18 @@ --- layout: default -title: Reindex +title: Reindex document parent: Document APIs -grand_parent: REST API reference nav_order: 60 +redirect_from: + - /opensearch/reindex-data/ + - /opensearch/rest-api/document-apis/reindex/ --- -# Index document -Introduced 1.0 +# Reindex document +**Introduced 1.0** {: .label .label-purple} -The reindex API operation lets you copy all or a subset of your data from a source index into a destination index. +The reindex document API operation lets you copy all or a subset of your data from a source index into a destination index. ## Example @@ -25,6 +27,7 @@ POST /_reindex } } ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -66,8 +69,8 @@ password | Password to authenticate with the remote cluster. socket_timeout | The wait time for socket reads. Default is 30s. connect_timeout | The wait time for remote connection timeouts. Default is 30s. size | The number of documents to reindex. -slice | Whether to manually or automatically slice the reindex operation so it executes in parallel. -_source | Whether to reindex source fields. Speicfy a list of fields to reindex or true to reindex all fields. Default is true. +slice | Whether to manually or automatically slice the reindex operation so it executes in parallel. Setting this field to `auto` allows OpenSearch to control the number of slices to use, which is one slice per shard, up to a maximum of 20. If there are multiple sources, the number of slices used are based on the index or backing index with the smallest number of shards. +_source | Whether to reindex source fields. Specify a list of fields to reindex or true to reindex all fields. Default is true. id | The ID to associate with manual slicing. max | Maximum number of slices. dest | Information about the destination index. Valid values are `index`, `version_type`, and `op_type`. 
diff --git a/_opensearch/rest-api/document-apis/update-by-query.md b/_api-reference/document-apis/update-by-query.md similarity index 84% rename from _opensearch/rest-api/document-apis/update-by-query.md rename to _api-reference/document-apis/update-by-query.md index bb7ebe2b..b53c3193 100644 --- a/_opensearch/rest-api/document-apis/update-by-query.md +++ b/_api-reference/document-apis/update-by-query.md @@ -2,12 +2,13 @@ layout: default title: Update by query parent: Document APIs -grand_parent: REST API reference nav_order: 50 +redirect_from: + - /opensearch/rest-api/document-apis/update-by-query/ --- # Update by query -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple} You can include a query and a script as part of your update request so OpenSearch can run the script to update all of the documents that match the query. @@ -31,6 +32,7 @@ POST test-index1/_update_by_query } } ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -44,30 +46,30 @@ All URL parameters are optional. Parameter | Type | Description :--- | :--- | :--- | :--- -<index> | String | Comma-separated list of indices to update. To update all indices, use * or omit this parameter. -allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indices. Default is `true`. +<index> | String | Comma-separated list of indexes to update. To update all indexes, use * or omit this parameter. +allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. analyzer | String | Analyzer to use in the query string. analyze_wildcard | Boolean | Whether the update operation should include wildcard and prefix queries in the analysis. Default is false. conflicts | String | Indicates to OpenSearch what should happen if the update by query operation runs into a version conflict. Valid options are `abort` and `proceed`. Default is `abort`. default_operator | String | Indicates whether the default operator for a string query should be `AND` or `OR`. Default is `OR`. df | String | The default field if a field prefix is not provided in the query string. -expand_wildcards | String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values. Valid values are `all` (match any index), `open` (match open, non-hidden indices), `closed` (match closed, non-hidden indices), `hidden` (match hidden indices), and `none` (deny wildcard expressions). Default is `open`. +expand_wildcards | String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values. Valid values are `all` (match any index), `open` (match open, non-hidden indexes), `closed` (match closed, non-hidden indexes), `hidden` (match hidden indexes), and `none` (deny wildcard expressions). Default is `open`. from | Integer | The starting index to search from. Default is 0. -ignore_unavailable | Boolean | Whether to exclude missing or closed indices in the response. Default is false. +ignore_unavailable | Boolean | Whether to exclude missing or closed indexes in the response. Default is false. lenient | Boolean | Specifies whether OpenSearch should accept requests if queries have format errors (for example, querying a text field for an integer). Default is false. max_docs | Integer | How many documents the update by query operation should process at most. Default is all documents. pipeline | String | ID of the pipeline to use to process documents. 
preference | String | Specifies which shard or node OpenSearch should perform the update by query operation on. q | String | Lucene query string's query. request_cache | Boolean | Specifies whether OpenSearch should use the request cache. Default is whether it’s enabled in the index’s settings. -refresh | Boolean | If true, OpenSearch refreshes shards to make the update by query operation available to search results. Valid options are `true`, `false`, and `wait_for`, which tells OpenSearch to wait for a refresh before executing the operation. Default is `false`. +refresh | Boolean | If true, OpenSearch refreshes shards to make the update by query operation available to search results. Valid options are `true` and `false`. Default is `false`. requests_per_second | Integer | Specifies the request's throttling in sub-requests per second. Default is -1, which means no throttling. routing | String | Value used to route the update by query operation to a specific shard. scroll | Time | How long to keep the search context open. scroll_size | Integer | Size of the operation's scroll request. Default is 1000. -search_type | String | Whether OpenSearch should use global term and document frequencies calculating revelance scores. Valid choices are `query_then_fetch` and `dfs_query_then_fetch`. `query_then_fetch` scores documents using local term and document frequencies for the shard. It’s usually faster but less accurate. `dfs_query_then_fetch` scores documents using global term and document frequencies across all shards. It’s usually slower but more accurate. Default is `query_then_fetch`. +search_type | String | Whether OpenSearch should use global term and document frequencies when calculating relevance scores. Valid choices are `query_then_fetch` and `dfs_query_then_fetch`. `query_then_fetch` scores documents using local term and document frequencies for the shard. It’s usually faster but less accurate. `dfs_query_then_fetch` scores documents using global term and document frequencies across all shards. It’s usually slower but more accurate. Default is `query_then_fetch`. search_timeout | Time | How long to wait until OpenSearch deems the request timed out. Default is no timeout. -slices | Integer | Number of sub-tasks OpenSearch should divide this task into. Default is 1, which means OpenSearch should not divide this task. +slices | String or integer | The number of slices to split the operation into for faster processing, specified as an integer. When set to `auto`, OpenSearch decides how many slices to use for the operation. Default is `1`, which indicates that the operation is not split. sort | List | A comma-separated list of <field> : <direction> pairs to sort by. _source | String | Whether to include the `_source` field in the response. _source_excludes | String | A comma-separated list of source fields to exclude from the response. @@ -77,10 +79,11 @@ terminate_after | Integer | The maximum number of documents OpenSearch should pr timeout | Time | How long the operation should wait for a response from active shards. Default is `1m`. version | Boolean | Whether to include the document version as a match. wait_for_active_shards | String | The number of shards that must be active before OpenSearch executes the operation. Valid values are `all` or any integer up to the total number of shards in the index. Default is 1, which is the primary shard.
+wait_for_completion | boolean | When set to `false`, the response body includes a task ID and OpenSearch executes the operation asynchronously. The task ID can be used to check the status of the task or to cancel the task. Default is set to `true`. ## Request body -To update your indices and documents by query, you must include a [query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index) and a script in the request body that OpenSearch can run to update your documents. If you don't specify a query, then every document in the index gets updated. +To update your indexes and documents by query, you must include a [query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index) and a script in the request body that OpenSearch can run to update your documents. If you don't specify a query, then every document in the index gets updated. ```json { diff --git a/_api-reference/document-apis/update-document.md b/_api-reference/document-apis/update-document.md new file mode 100644 index 00000000..365cb3aa --- /dev/null +++ b/_api-reference/document-apis/update-document.md @@ -0,0 +1,228 @@ +--- +layout: default +title: Update document +parent: Document APIs +nav_order: 10 +redirect_from: + - /opensearch/rest-api/document-apis/update-document/ +--- + +# Update document +**Introduced 1.0** +{: .label .label-purple } + +If you need to update a document's fields in your index, you can use the update document API operation. You can do so by specifying the new data you want to be in your index or by including a script in your request body, which OpenSearch runs to update the document. By default, the update operation only updates a document that exists in the index. If a document does not exist, the API returns an error. To _upsert_ a document (update the document that exists or index a new one), use the [upsert](#upsert) operation. + +## Example + +```json +POST /sample-index1/_update/1 +{ + "doc": { + "first_name" : "Bruce", + "last_name" : "Wayne" + } +} +``` +{% include copy-curl.html %} + +## Script example + +```json +POST /test-index1/_update/1 +{ + "script" : { + "source": "ctx._source.secret_identity = \"Batman\"" + } +} +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +POST //_update/<_id> +``` + +## URL parameters + +Parameter | Type | Description | Required +:--- | :--- | :--- | :--- +<index> | String | Name of the index. | Yes +<_id> | String | The ID of the document to update. | Yes +if_seq_no | Integer | Only perform the update operation if the document has the specified sequence number. | No +if_primary_term | Integer | Perform the update operation if the document has the specified primary term. | No +lang | String | Language of the script. Default is `painless`. | No +require_alias | Boolean | Specifies whether the destination must be an index alias. Default is false. | No +refresh | Enum | If true, OpenSearch refreshes shards to make the operation visible to searching. Valid options are `true`, `false`, and `wait_for`, which tells OpenSearch to wait for a refresh before executing the operation. Default is `false`. | No +retry_on_conflict | Integer | The amount of times OpenSearch should retry the operation if there's a document conflict. Default is 0. | No +routing | String | Value to route the update operation to a specific shard. | No +_source | Boolean or List | Whether or not to include the `_source` field in the response body. Default is `false`. 
This parameter also supports a comma-separated list of source fields for including multiple source fields in the query response. | No +_source_excludes | List | A comma-separated list of source fields to exclude in the query response. | No +_source_includes | List | A comma-separated list of source fields to include in the query response. | No +timeout | Time | How long to wait for a response from the cluster. | No +wait_for_active_shards | String | The number of active shards that must be available before OpenSearch processes the update request. Default is 1 (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the operation to succeed. | No + +## Request body + +Your request body must contain the information with which you want to update your document. If you only want to replace certain fields in your document, your request body must include a `doc` object containing the fields that you want to update: + +```json +{ + "doc": { + "first_name": "Thomas", + "last_name": "Wayne" + } +} +``` + +You can also use a script to tell OpenSearch how to update your document: + +```json +{ + "script" : { + "source": "ctx._source.oldValue += params.newValue", + "lang": "painless", + "params" : { + "newValue" : 10 + } + } +} +``` + +## Upsert + +Upsert is an operation that conditionally either updates an existing document or inserts a new one based on information in the object. + +In the following example, the `upsert` operation updates the `first_name` and `last_name` fields if a document already exists. If a document does not exist, a new one is indexed using content in the `upsert` object. + +```json +POST /sample-index1/_update/1 +{ + "doc": { + "first_name": "Martha", + "last_name": "Rivera" + }, + "upsert": { + "last_name": "Oliveira", + "age": "31" + } +} +``` + +Consider an index that contains the following document: + +```json +{ + "_index": "sample-index1", + "_id": "1", + "_score": 1, + "_source": { + "first_name": "Bruce", + "last_name": "Wayne" + } +} +``` + +After the upsert operation, the document's `first_name` and `last_name` fields are updated: + +```json +{ + "_index": "sample-index1", + "_id": "1", + "_score": 1, + "_source": { + "first_name": "Martha", + "last_name": "Rivera" + } +} +``` + +If the document does not exist in the index, a new document is indexed with the fields specified in the `upsert` object: + +```json +{ + "_index": "sample-index1", + "_id": "1", + "_score": 1, + "_source": { + "last_name": "Oliveira", + "age": "31" + } +} +``` + +You can also add `doc_as_upsert` to the request and set it to `true` to use the information in the `doc` field for performing the upsert operation: + +```json +POST /sample-index1/_update/1 +{ + "doc": { + "first_name": "Martha", + "last_name": "Oliveira", + "age": "31" + }, + "doc_as_upsert": true +} +``` + +Consider an index that contains the following document: + +```json +{ + "_index": "sample-index1", + "_id": "1", + "_score": 1, + "_source": { + "first_name": "Bruce", + "last_name": "Wayne" + } +} +``` + +After the upsert operation, the document's `first_name` and `last_name` fields are updated and an `age` field is added. If the document does not exist in the index, a new document is indexed with the fields specified in the `upsert` object. 
In both cases, the document is as follows: + +```json +{ + "_index": "sample-index1", + "_id": "1", + "_score": 1, + "_source": { + "first_name": "Martha", + "last_name": "Oliveira", + "age": "31" + } +} +``` + +## Response +```json +{ + "_index": "sample-index1", + "_id": "1", + "_version": 3, + "result": "updated", + "_shards": { + "total": 2, + "successful": 2, + "failed": 0 + }, + "_seq_no": 4, + "_primary_term": 17 +} +``` + +## Response body fields + +Field | Description +:--- | :--- +_index | The name of the index. +_id | The document's ID. +_version | The document's version. +result | The result of the update operation. +_shards | Detailed information about the cluster's shards. +total | The total number of shards. +successful | The number of shards OpenSearch successfully updated the document in. +failed | The number of shards OpenSearch failed to update the document in. +_seq_no | The sequence number assigned when the document was indexed. +_primary_term | The primary term assigned when the document was indexed. diff --git a/_opensearch/rest-api/explain.md b/_api-reference/explain.md similarity index 92% rename from _opensearch/rest-api/explain.md rename to _api-reference/explain.md index cbc7c5b0..57b7d9fa 100644 --- a/_opensearch/rest-api/explain.md +++ b/_api-reference/explain.md @@ -1,17 +1,18 @@ --- layout: default title: Explain -parent: REST API reference -nav_order: 140 +nav_order: 30 +redirect_from: + - /opensearch/rest-api/explain/ --- # Explain -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } Wondering why a specific document ranks higher (or lower) for a query? You can use the explain API for an explanation of how the relevance score (`_score`) is calculated for every result. -OpenSearch uses a probabilistic ranking framework called [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) to calculate relevance scores. Okapi BM25 is based on the original [TF/IDF](http://lucene.apache.org/core/{{site.lucene_version}}/core/org/apache/lucene/search/package-summary.html#scoring) framework used by Apache Lucene. +OpenSearch uses a probabilistic ranking framework called [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) to calculate relevance scores. Okapi BM25 is based on the original [TF/IDF](https://lucene.apache.org/core/{{site.lucene_version}}/core/org/apache/lucene/search/package-summary.html#scoring) framework used by Apache Lucene. The explain API is an expensive operation in terms of both resources and time. On production clusters, we recommend using it sparingly for the purpose of troubleshooting. {: .warning } @@ -22,7 +23,7 @@ The explain API is an expensive operation in terms of both resources and time. O To see the explain output for all results, set the `explain` flag to `true` either in the URL or in the body of the request: ```json -POST kibana_sample_data_ecommerce/_search?explain=true +POST opensearch_dashboards_sample_data_ecommerce/_search?explain=true { "query": { "match": { @@ -31,11 +32,12 @@ POST kibana_sample_data_ecommerce/_search?explain=true } } ``` +{% include copy-curl.html %} More often, you want the output for a single document. 
In that case, specify the document ID in the URL: ```json -POST kibana_sample_data_ecommerce/_explain/EVz1Q3sBgg5eWQP6RSte +POST opensearch_dashboards_sample_data_ecommerce/_explain/EVz1Q3sBgg5eWQP6RSte { "query": { "match": { @@ -44,6 +46,7 @@ POST kibana_sample_data_ecommerce/_explain/EVz1Q3sBgg5eWQP6RSte } } ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -78,7 +81,6 @@ Parameter | Type | Description | Required ```json { "_index" : "kibana_sample_data_ecommerce", - "_type" : "_doc", "_id" : "EVz1Q3sBgg5eWQP6RSte", "matched" : true, "explanation" : { @@ -158,6 +160,6 @@ Term frequency (`tf`) | How many times the term appears in a field for a given d Inverse document frequency (`idf`) | How often the term appears within the index (across all the documents). The more often the term appears the lower is the relevance score. Field normalization factor (`fieldNorm`) | The length of the field. OpenSearch assigns a higher relevance score to a term appearing in a relatively short field. -The `tf`, `idf`, and `fieldNorm` values are calculated and stored at index time when a document is added or updated. The values might have some (typically small) inaccuracies as it’s based on summing the samples returned from each shard. +The `tf`, `idf`, and `fieldNorm` values are calculated and stored at index time when a document is added or updated. The values might have some (typically small) inaccuracies as it’s based on summing the samples returned from each shard. Individual queries include other factors for calculating the relevance score, such as term proximity, fuzziness, and so on. diff --git a/_opensearch/rest-api/alias.md b/_api-reference/index-apis/alias.md similarity index 81% rename from _opensearch/rest-api/alias.md rename to _api-reference/index-apis/alias.md index 99861d6a..a38a3798 100644 --- a/_opensearch/rest-api/alias.md +++ b/_api-reference/index-apis/alias.md @@ -1,15 +1,18 @@ --- layout: default title: Alias -parent: REST API reference +parent: Index APIs nav_order: 5 +redirect_from: + - /opensearch/rest-api/alias/ + - /api-reference/alias/ --- # Alias -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } -An alias is a virtual pointer that you can use to reference one or more indices. Creating and updating aliases are atomic operations, so you can reindex your data and point an alias at it without any downtime. +An alias is a virtual pointer that you can use to reference one or more indexes. Creating and updating aliases are atomic operations, so you can reindex your data and point an alias at it without any downtime. ## Example @@ -30,10 +33,10 @@ POST _aliases "alias": "old-index-alias" } } - ] } ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -47,7 +50,7 @@ All alias parameters are optional. Parameter | Data Type | Description :--- | :--- | :--- -master_timeout | Time | The amount of time to wait for a response from the master node. Default is `30s`. +cluster_manager_timeout | Time | The amount of time to wait for a response from the cluster manager node. Default is `30s`. timeout | Time | The amount of time to wait for a response from the cluster. Default is `30s`. ## Request body @@ -67,7 +70,7 @@ aliases | Array | Array of alias names. | Yes if you don't supply an `alias` fie filter | Object | A filter to use with the alias, so the alias points to a filtered part of the index. 
| No is_hidden | Boolean | Specifies whether the alias should be hidden from results that include wildcard expressions | No must_exist | Boolean | Specifies whether the alias to remove must exist. | No -is_write_index | Boolean | Specifies whether the index should be a write index. An alias can only have one write index at a time. | No +is_write_index | Boolean | Specifies whether the index should be a write index. An alias can only have one write index at a time. If a write request is submitted to a alias that links to multiple indexes, OpenSearch executes the request only on the write index. | No routing | String | Used to assign a custom value to a shard for specific operations. | No index_routing | String | Assigns a custom value to a shard only for index operations. | No search_routing | String | Assigns a custom value to a shard only for search operations. | No @@ -79,3 +82,5 @@ search_routing | String | Assigns a custom value to a shard only for search oper "acknowledged": true } ``` + +For more alias API operations, see [Index aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias/). \ No newline at end of file diff --git a/_api-reference/index-apis/clear-index-cache.md b/_api-reference/index-apis/clear-index-cache.md new file mode 100644 index 00000000..55a5ce85 --- /dev/null +++ b/_api-reference/index-apis/clear-index-cache.md @@ -0,0 +1,129 @@ +--- +layout: default +title: Clear cache +parent: Index APIs +nav_order: 10 +--- + +# Clear cache +**Introduced 1.0** +{: .label .label-purple } + +The clear cache API operation clears the caches of one or more indexes. For data streams, the API clears the caches of the stream’s backing indexes. + + +If you use the Security plugin, you must have the `manage index` privileges. +{: .note} + +## Path parameters + +| Parameter | Data type | Description | +:--- | :--- | :--- +| target | String | Comma-delimited list of data streams, indexes, and index aliases to which cache clearing is applied. Wildcard expressions (`*`) are supported. To target all data streams and indexes in a cluster, omit this parameter or use `_all` or `*`. Optional. | + + +## Query parameters + +All query parameters are optional. + +| Parameter | Data type | Description | +:--- | :--- | :--- +| allow_no_indices | Boolean | Whether to ignore wildcards, index aliases, or `_all` target (`target` path parameter) values that don’t match any indexes. If `false`, the request returns an error if any wildcard expression, index alias, or `_all` target value doesn't match any indexes. This behavior also applies if the request targets include other open indexes. For example, a request where the target is `fig*,app*` returns an error if an index starts with `fig` but no index starts with `app`. Defaults to `true`. | +| expand_wildcards | String | Determines the index types that wildcard expressions can expand to. Accepts multiple values separated by a comma, such as `open,hidden`. Valid values are:

`all` -- Expand to open, closed, and hidden indexes.

`open` -- Expand only to open indexes.

`closed` -- Expand only to closed indexes

`hidden` -- Expand to include hidden indexes. Must be combined with `open`, `closed`, or `both`.

`none` -- Expansions are not accepted.

Defaults to `open`. | +| fielddata | Boolean | If `true`, clears the fields cache. Use the `fields` parameter to clear specific fields' caches. Defaults to `true`. | +| fields | String | Used in conjunction with the `fielddata` parameter. Comma-delimited list of field names that are cleared out of the cache. Does not support objects or field aliases. Defaults to all fields. | +| file | Boolean | If `true`, clears the unused entries from the file cache on nodes with the Search role. Defaults to `false`. | +| index | String | Comma-delimited list of index names that are cleared out of the cache. | +| ignore_unavailable | Boolean | If `true`, OpenSearch ignores missing or closed indexes. Defaults to `false`. | +| query | Boolean | If `true`, clears the query cache. Defaults to `true`. | +| request | Boolean | If `true`, clears the request cache. Defaults to `true`. | + +#### Example requests + +The following example requests show multiple clear cache API uses. + +##### Clear a specific cache + +The following request clears the fields cache only: + +```json +POST /my-index/_cache/clear?fielddata=true +``` +{% include copy-curl.html %} + +
+ +The following request clears the query cache only: + +```json +POST /my-index/_cache/clear?query=true +``` +{% include copy-curl.html %} + +
+ +The following request clears the request cache only: + +```json +POST /my-index/_cache/clear?request=true +``` +{% include copy-curl.html %} + +#### Clear the cache for specific fields + +The following request clears the fields caches of `fielda` and `fieldb`: + +```json +POST /my-index/_cache/clear?fields=fielda,fieldb +``` +{% include copy-curl.html %} + +#### Clear caches for specific data streams or indexes + +The following request clears the cache for two specific indexes: + +```json +POST /my-index,my-index2/_cache/clear +``` +{% include copy-curl.html %} + +#### Clear caches for all data streams and indexes + +The following request clears the cache for all data streams and indexes: + +```json +POST /_cache/clear +``` +{% include copy-curl.html %} + +#### Clear unused entries from the cache on search-capable nodes + +```json +POST /*/_cache/clear?file=true +``` +{% include copy-curl.html %} + +#### Example response + +The `POST /books,hockey/_cache/clear` request returns the following fields: + +```json +{ + "_shards" : { + "total" : 4, + "successful" : 2, + "failed" : 0 + } +} +``` + +## Response fields + +The `POST /books,hockey/_cache/clear` request returns the following response fields: + +| Field | Data type | Description | +:--- | :--- | :--- +| _shards | Object | Shard information. | +| total | Integer | Total number of shards. | +| successful | Integer | Number of index shards with caches successfully cleared. | +| failed | Integer | Number of index shards with caches that failed to clear. | diff --git a/_api-reference/index-apis/clone.md b/_api-reference/index-apis/clone.md new file mode 100644 index 00000000..60228b58 --- /dev/null +++ b/_api-reference/index-apis/clone.md @@ -0,0 +1,77 @@ +--- +layout: default +title: Clone index +parent: Index APIs +nav_order: 15 +redirect_from: + - /opensearch/rest-api/index-apis/clone/ +--- + +# Clone index +**Introduced 1.0** +{: .label .label-purple } + +The clone index API operation clones all data in an existing read-only index into a new index. The new index cannot already exist. + +## Example + +```json +PUT /sample-index1/_clone/cloned-index1 +{ + "settings": { + "index": { + "number_of_shards": 2, + "number_of_replicas": 1 + } + }, + "aliases": { + "sample-alias1": {} + } +} +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +POST //_clone/ +PUT //_clone/ +``` + +## Index naming restrictions + +OpenSearch indexes have the following naming restrictions: + +- All letters must be lowercase. +- Index names can't begin with underscores (`_`) or hyphens (`-`). +- Index names can't contain spaces, commas, or the following characters: + + `:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<` + +## URL parameters + +Your request must include the source and target indexes. All other clone index parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +<source-index> | String | The source index to clone. +<target-index> | String | The index to create and add cloned data to. +wait_for_active_shards | String | The number of active shards that must be available before OpenSearch processes the request. Default is 1 (only the primary shard). Set to all or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the operation to succeed. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. 
+timeout | Time | How long to wait for the request to return. Default is `30s`. +wait_for_completion | Boolean | When set to `false`, the request returns immediately instead of after the operation is finished. To monitor the operation status, use the [Tasks API]({{site.url}}{{site.baseurl}}/api-reference/tasks/) with the task ID returned by the request. Default is `true`. +task_execution_timeout | Time | The explicit task execution timeout. Only useful when wait_for_completion is set to `false`. Default is `1h`. + +## Request body + +The clone index API operation creates a new target index, so you can specify any [index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/) and [aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias/) to apply to the target index. + +## Response + +```json +{ + "acknowledged": true, + "shards_acknowledged": true, + "index": "cloned-index1" +} +``` diff --git a/_opensearch/rest-api/index-apis/close-index.md b/_api-reference/index-apis/close-index.md similarity index 72% rename from _opensearch/rest-api/index-apis/close-index.md rename to _api-reference/index-apis/close-index.md index 8c2d09a4..e8d2e3e1 100644 --- a/_opensearch/rest-api/index-apis/close-index.md +++ b/_api-reference/index-apis/close-index.md @@ -2,21 +2,23 @@ layout: default title: Close index parent: Index APIs -grand_parent: REST API reference -nav_order: 30 +nav_order: 20 +redirect_from: + - /opensearch/rest-api/index-apis/close-index/ --- # Close index -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } The close index API operation closes an index. Once an index is closed, you cannot add data to it or search for any data within the index. -## Example +#### Example ```json POST /sample-index/_close ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -30,12 +32,12 @@ All parameters are optional. Parameter | Type | Description :--- | :--- | :--- -<index-name> | String | The index to close. Can be a comma-separated list of multiple index names. Use `_all` or * to close all indices. -allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indices. Default is true. -expand_wildcards | String | Expands wildcard expressions to different indices. Combine multiple values with commas. Available values are all (match all indices), open (match open indices), closed (match closed indices), hidden (match hidden indices), and none (do not accept wildcard expressions). Default is open. -ignore_unavailable | Boolean | If true, OpenSearch does not search for missing or closed indices. Default is false. +<index-name> | String | The index to close. Can be a comma-separated list of multiple index names. Use `_all` or * to close all indexes. +allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indexes. Default is true. +expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are all (match all indexes), open (match open indexes), closed (match closed indexes), hidden (match hidden indexes), and none (do not accept wildcard expressions). Default is open. +ignore_unavailable | Boolean | If true, OpenSearch does not search for missing or closed indexes. Default is false. wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the request. Default is 1 (only the primary shard). Set to all or a positive integer. Values greater than 1 require replicas. 
For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the request to succeed. -master_timeout | Time | How long to wait for a connection to the master node. Default is `30s`. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. timeout | Time | How long to wait for a response from the cluster. Default is `30s`. diff --git a/_api-reference/index-apis/create-index.md b/_api-reference/index-apis/create-index.md new file mode 100644 index 00000000..5e0d504f --- /dev/null +++ b/_api-reference/index-apis/create-index.md @@ -0,0 +1,77 @@ +--- +layout: default +title: Create index +parent: Index APIs +nav_order: 25 +redirect_from: + - /opensearch/rest-api/index-apis/create-index/ + - /opensearch/rest-api/create-index/ +--- + +# Create index +**Introduced 1.0** +{: .label .label-purple } + +While you can create an index by using a document as a base, you can also create an empty index for later use. + +When creating an index, you can specify its mappings, settings, and aliases. + +## Path and HTTP methods + +``` +PUT +``` + +## Index naming restrictions + +OpenSearch indexes have the following naming restrictions: + +- All letters must be lowercase. +- Index names can't begin with underscores (`_`) or hyphens (`-`). +- Index names can't contain spaces, commas, or the following characters: + + `:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<` + +## Path parameters + +| Parameter | Description | +:--- | :--- +| index | String | The index name. Must conform to the [index naming restrictions](#index-naming-restrictions). Required. | + +## Query parameters + +You can include the following query parameters in your request. All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the request. Default is 1 (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the request to succeed. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. +timeout | Time | How long to wait for the request to return. Default is `30s`. + +## Request body + +As part of your request, you can optionally specify [index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/), [mappings]({{site.url}}{{site.baseurl}}/field-types/index/), and [aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias/) for your newly created index. 
+ +#### Example request + +```json +PUT /sample-index1 +{ + "settings": { + "index": { + "number_of_shards": 2, + "number_of_replicas": 1 + } + }, + "mappings": { + "properties": { + "age": { + "type": "integer" + } + } + }, + "aliases": { + "sample-alias1": {} + } +} +``` diff --git a/_api-reference/index-apis/dangling-index.md b/_api-reference/index-apis/dangling-index.md new file mode 100644 index 00000000..9d40687f --- /dev/null +++ b/_api-reference/index-apis/dangling-index.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Dangling indexes +parent: Index APIs +nav_order: 30 +--- + +# Dangling indexes API +**Introduced 1.0** +{: .label .label-purple } + +After a node joins a cluster, dangling indexes occur if any shards exist in the node's local directory that do not already exist in the cluster. Dangling indexes can be listed, deleted, or imported. + +## Path and HTTP methods + +List dangling indexes: + +``` +GET /_dangling +``` + +Import a dangling index: + +``` +POST /_dangling/ +``` + +Delete a dangling index: + +``` +DELETE /_dangling/ +``` + +## Path parameters + +Path parameters are required. + +Path parameter | Description +:--- | :--- +index-uuid | UUID of index. + +## Query parameters + +Query parameters are optional. + +Query parameter | Data type | Description +:--- | :--- | :--- +accept_data_loss | Boolean | Must be set to `true` for an `import` or `delete` because OpenSearch is unaware of where the dangling index data came from. +timeout | Time units | The amount of time to wait for a response. If no response is received in the defined time period, an error is returned. Default is `30` seconds. +cluster_manager_timeout | Time units | The amount of time to wait for a connection to the cluster manager. If no response is received in the defined time period, an error is returned. Default is `30` seconds. + +## Examples + +The following are example requests and a example response. + +#### Sample list + +````bash +GET /_dangling +```` +{% include copy-curl.html %} + +#### Sample import + +````bash +POST /_dangling/msdjernajxAT23RT-BupMB?accept_data_loss=true +```` +{% include copy-curl.html %} + +#### Sample delete + +````bash +DELETE /_dangling/msdjernajxAT23RT-BupMB?accept_data_loss=true +```` + +#### Example response body + +````json +{ + "_nodes": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "cluster_name": "opensearch-cluster", + "dangling_indices": [msdjernajxAT23RT-BupMB] +} +```` diff --git a/_opensearch/rest-api/index-apis/delete-index.md b/_api-reference/index-apis/delete-index.md similarity index 61% rename from _opensearch/rest-api/index-apis/delete-index.md rename to _api-reference/index-apis/delete-index.md index ab6377a0..5973a8cf 100644 --- a/_opensearch/rest-api/index-apis/delete-index.md +++ b/_api-reference/index-apis/delete-index.md @@ -2,12 +2,13 @@ layout: default title: Delete index parent: Index APIs -grand_parent: REST API reference -nav_order: 10 +nav_order: 35 +redirect_from: + - /opensearch/rest-api/index-apis/delete-index/ --- # Delete index -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } If you no longer need an index, you can use the delete index API operation to delete it. @@ -17,6 +18,7 @@ If you no longer need an index, you can use the delete index API operation to de ```json DELETE /sample-index ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -30,10 +32,10 @@ All parameters are optional. 
Parameter | Type | Description :--- | :--- | :--- -allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indices. Default is true. -expand_wildcards | String | Expands wildcard expressions to different indices. Combine multiple values with commas. Available values are all (match all indices), open (match open indices), closed (match closed indices), hidden (match hidden indices), and none (do not accept wildcard expressions), which must be used with open, closed, or both. Default is open. -ignore_unavailable | Boolean | If true, OpenSearch does not include missing or closed indices in the response. -master_timeout | Time | How long to wait for a connection to the master node. Default is `30s`. +allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indexes. Default is true. +expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are all (match all indexes), open (match open indexes), closed (match closed indexes), hidden (match hidden indexes), and none (do not accept wildcard expressions), which must be used with open, closed, or both. Default is open. +ignore_unavailable | Boolean | If true, OpenSearch does not include missing or closed indexes in the response. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. timeout | Time | How long to wait for the response to return. Default is `30s`. diff --git a/_opensearch/rest-api/index-apis/exists.md b/_api-reference/index-apis/exists.md similarity index 73% rename from _opensearch/rest-api/index-apis/exists.md rename to _api-reference/index-apis/exists.md index 3a156277..6d439a96 100644 --- a/_opensearch/rest-api/index-apis/exists.md +++ b/_api-reference/index-apis/exists.md @@ -2,12 +2,13 @@ layout: default title: Index exists parent: Index APIs -grand_parent: REST API reference -nav_order: 5 +nav_order: 50 +redirect_from: + - /opensearch/rest-api/index-apis/exists/ --- # Index exists -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } The index exists API operation returns whether or not an index already exists. @@ -17,6 +18,7 @@ The index exists API operation returns whether or not an index already exists. ```json HEAD /sample-index ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -30,12 +32,12 @@ All parameters are optional. Parameter | Type | Description :--- | :--- | :--- -allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indices. Default is true. -expand_wildcards | String | Expands wildcard expressions to different indices. Combine multiple values with commas. Available values are all (match all indices), open (match open indices), closed (match closed indices), hidden (match hidden indices), and none (do not accept wildcard expressions). Default is open. +allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indexes. Default is true. +expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are all (match all indexes), open (match open indexes), closed (match closed indexes), hidden (match hidden indexes), and none (do not accept wildcard expressions). Default is open. flat_settings | Boolean | Whether to return settings in the flat form, which can improve readability, especially for heavily nested settings. 
For example, the flat form of "index": { "creation_date": "123456789" } is "index.creation_date": "123456789". include_defaults | Boolean | Whether to include default settings as part of the response. This parameter is useful for identifying the names and current values of settings you want to update. -ignore_unavailable | Boolean | If true, OpenSearch does not search for missing or closed indices. Default is false. -local | Boolean | Whether to return information from only the local node instead of from the master node. Default is false. +ignore_unavailable | Boolean | If true, OpenSearch does not search for missing or closed indexes. Default is false. +local | Boolean | Whether to return information from only the local node instead of from the cluster manager node. Default is false. ## Response diff --git a/_api-reference/index-apis/force-merge.md b/_api-reference/index-apis/force-merge.md new file mode 100644 index 00000000..6ad2e7f2 --- /dev/null +++ b/_api-reference/index-apis/force-merge.md @@ -0,0 +1,125 @@ +--- +layout: default +title: Force merge +parent: Index APIs +nav_order: 37 +--- + +# Force merge +**Introduced 1.0** +{: .label .label-purple } + +The force merge API operation forces a merge on the shards of one or more indexes. For a data stream, the API forces a merge on the shards of the stream's backing index. + +## The merge operation + +In OpenSearch, a shard is a Lucene index, which consists of _segments_ (or segment files). Segments store the indexed data. Periodically, smaller segments are merged into larger ones and the larger segments become immutable. Merging reduces the overall number of segments on each shard and frees up disk space. + +OpenSearch performs background segment merges that produce segments no larger than `index.merge.policy.max_merged_segment` (the default is 5 GB). + +## Deleted documents + +When a document is deleted from an OpenSearch index, it is not deleted from the Lucene segment but is rather only marked to be deleted. When the segment files are merged, deleted documents are removed (or _expunged_). Thus, merging also frees up space occupied by documents marked as deleted. + +## Force Merge API + +In addition to periodic merging, you can force a segment merge using the Force Merge API. + +Use the Force Merge API on an index only after all write requests sent to the index are completed. The force merge operation can produce very large segments. If write requests are still sent to the index, then the merge policy does not merge these segments until they primarily consist of deleted documents. This can increase disk space usage and lead to performance degradation. +{: .warning} + +When you call the Force Merge API, the call is blocked until merge completion. If during this time the connection is lost, the force merge operation continues in the background. New force merge requests sent to the same index will be blocked until the currently running merge operation is complete. + +## Force merging multiple indexes + +To force merge multiple indexes, you can call the Force Merge API on the following index combinations: + +- Multiple indexes +- One or more data streams containing multiple backing indexes +- One or more index aliases pointing to multiple indexes +- All data streams and indexes in a cluster + +When you force merge multiple indexes, the merge operation is executed on each shard of a node sequentially. 
When the force merge operation is in progress, the storage for the shard temporarily increases so that all segments can be rewritten into a new segment. When `max_num_segments` is set to `1`, the storage for the shard temporarily doubles. + +## Force merging data streams + +It can be useful to force merge data streams in order to manage a data stream's backing indexes, especially after a rollover operation. Time-based indexes receive indexing requests only during a specified time period. Once that time period has elapsed and the index receives no more write requests, you can force merge segments of all index shards into one segment. Searches on single-segment shards are more efficient because they use simpler data structures. + +## Path and HTTP methods + +```json +POST /_forcemerge +POST /<index>/_forcemerge/ +``` + +## Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `<index>` | String | A comma-separated list of indexes, data streams, or index aliases to which the operation is applied. Supports wildcard expressions (`*`). Use `_all` or `*` to specify all indexes and data streams in a cluster. | + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `allow_no_indices` | Boolean | If `false`, the request returns an error if any wildcard expression or index alias targets any closed or missing indexes. Default is `true`. | +| `expand_wildcards` | String | Specifies the types of indexes to which wildcard expressions can expand. Supports comma-separated values. Valid values are:
- `all`: Expand to all open and closed indexes, including hidden indexes.
- `open`: Expand to open indexes.
- `closed`: Expand to closed indexes.
- `hidden`: Include hidden indexes when expanding. Must be combined with `open`, `closed`, or both.
- `none`: Do not accept wildcard expressions.
Default is `open`. | +| `flush` | Boolean | Performs a flush on the indexes after the force merge. A flush ensures that the files are persisted to disk. Default is `true`. | +| `ignore_unavailable` | Boolean | If `true`, OpenSearch ignores missing or closed indexes. If `false`, OpenSearch returns an error if the force merge operation encounters missing or closed indexes. Default is `false`. | +| `max_num_segments` | Integer | The number of larger segments into which smaller segments are merged. Set this parameter to `1` to merge all segments into one segment. The default behavior is to perform the merge as necessary. | +| `only_expunge_deletes` | Boolean | If `true`, the merge operation only expunges segments containing a certain percentage of deleted documents. The percentage is 10% by default and is configurable in the `index.merge.policy.expunge_deletes_allowed` setting. Prior to OpenSearch 2.12, `only_expunge_deletes` ignored the `index.merge.policy.max_merged_segment` setting. Starting with OpenSearch 2.12, using `only_expunge_deletes` does not produce segments larger than `index.merge.policy.max_merged_segment` (by default, 5 GB). For more information, see [Deleted documents](#deleted-documents). Default is `false`. | + +#### Example request: Force merge a specific index + +```json +POST /testindex1/_forcemerge +``` +{% include copy-curl.html %} + +#### Example request: Force merge multiple indexes + +```json +POST /testindex1,testindex2/_forcemerge +``` +{% include copy-curl.html %} + +#### Example request: Force merge all indexes + +```json +POST /_forcemerge +``` +{% include copy-curl.html %} + +#### Example request: Force merge a data stream's backing indexes into one segment + +```json +POST /.testindex-logs/_forcemerge?max_num_segments=1 +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + } +} +``` + +## Response fields + +The following table lists all response fields. + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `shards` | Object | Contains information about the shards on which the request was executed. | +| `shards.total` | Integer | The number of shards on which the operation was executed. | +| `shards.successful` | Integer | The number of shards on which the operation was successful. | +| `shards.failed` | Integer | The number of shards on which the operation failed. | diff --git a/_opensearch/rest-api/index-apis/get-index.md b/_api-reference/index-apis/get-index.md similarity index 77% rename from _opensearch/rest-api/index-apis/get-index.md rename to _api-reference/index-apis/get-index.md index 7e37a6ee..899e82e9 100644 --- a/_opensearch/rest-api/index-apis/get-index.md +++ b/_api-reference/index-apis/get-index.md @@ -2,12 +2,13 @@ layout: default title: Get index parent: Index APIs -grand_parent: REST API reference -nav_order: 20 +nav_order: 40 +redirect_from: + - /opensearch/rest-api/index-apis/get-index/ --- # Get index -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } You can use the get index API operation to return information about an index. @@ -17,6 +18,7 @@ You can use the get index API operation to return information about an index. ```json GET /sample-index ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -30,13 +32,13 @@ All parameters are optional. Parameter | Type | Description :--- | :--- | :--- -allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indices. Default is true. 
-expand_wildcards | String | Expands wildcard expressions to different indices. Combine multiple values with commas. Available values are all (match all indices), open (match open indices), closed (match closed indices), hidden (match hidden indices), and none (do not accept wildcard expressions), which must be used with open, closed, or both. Default is open. +allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indexes. Default is true. +expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are all (match all indexes), open (match open indexes), closed (match closed indexes), hidden (match hidden indexes), and none (do not accept wildcard expressions), which must be used with open, closed, or both. Default is open. flat_settings | Boolean | Whether to return settings in the flat form, which can improve readability, especially for heavily nested settings. For example, the flat form of "index": { "creation_date": "123456789" } is "index.creation_date": "123456789". include_defaults | Boolean | Whether to include default settings as part of the response. This parameter is useful for identifying the names and current values of settings you want to update. -ignore_unavailable | Boolean | If true, OpenSearch does not include missing or closed indices in the response. -local | Boolean | Whether to return information from only the local node instead of from the master node. Default is false. -master_timeout | Time | How long to wait for a connection to the master node. Default is `30s`. +ignore_unavailable | Boolean | If true, OpenSearch does not include missing or closed indexes in the response. +local | Boolean | Whether to return information from only the local node instead of from the cluster manager node. Default is false. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. ## Response diff --git a/_api-reference/index-apis/get-settings.md b/_api-reference/index-apis/get-settings.md new file mode 100644 index 00000000..37ac291a --- /dev/null +++ b/_api-reference/index-apis/get-settings.md @@ -0,0 +1,67 @@ +--- +layout: default +title: Get settings +parent: Index APIs +nav_order: 45 +redirect_from: + - /opensearch/rest-api/index-apis/get-index/ +--- + +# Get settings +**Introduced 1.0** +{: .label .label-purple } + +The get settings API operation returns all the settings in your index. + +## Example + +```json +GET /sample-index1/_settings +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +GET /_settings +GET //_settings +GET //_settings/ +``` + +## URL parameters + +All get settings parameters are optional. + +Parameter | Data type | Description +:--- | :--- | :--- +<target-index> | String | The index to get settings from. Can be a comma-separated list to get settings from multiple indexes, or use `_all` to return settings from all indexes within the cluster. +<setting> | String | Filter to return specific settings. +allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. +expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are `all` (match all indexes), `open` (match open indexes), `closed` (match closed indexes), `hidden` (match hidden indexes), and `none` (do not accept wildcard expressions), which must be used with `open`, `closed`, or both. Default is `open`. 
+flat_settings | Boolean | Whether to return settings in the flat form, which can improve readability, especially for heavily nested settings. For example, the flat form of “index”: { “creation_date”: “123456789” } is “index.creation_date”: “123456789”. +include_defaults | String | Whether to include default settings, including settings used within OpenSearch plugins, in the response. Default is false. +ignore_unavailable | Boolean | If true, OpenSearch does not include missing or closed indexes in the response. +local | Boolean | Whether to return information from the local node only instead of the cluster manager node. Default is false. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. + +## Response + +```json +{ + "sample-index1": { + "settings": { + "index": { + "creation_date": "1622672553417", + "number_of_shards": "1", + "number_of_replicas": "1", + "uuid": "GMEA0_TkSaamrnJSzNLzwg", + "version": { + "created": "135217827", + "upgraded": "135238227" + }, + "provided_name": "sample-index1" + } + } + } +} +``` diff --git a/_opensearch/rest-api/index-apis/index.md b/_api-reference/index-apis/index.md similarity index 58% rename from _opensearch/rest-api/index-apis/index.md rename to _api-reference/index-apis/index.md index c105615a..4c059b3e 100644 --- a/_opensearch/rest-api/index-apis/index.md +++ b/_api-reference/index-apis/index.md @@ -1,16 +1,17 @@ --- layout: default title: Index APIs -parent: REST API reference has_children: true -nav_order: 3 +nav_order: 35 redirect_from: - /opensearch/rest-api/index-apis/ --- # Index APIs +**Introduced 1.0** +{: .label .label-purple } -The index API operations let you interact with indices in your cluster. Using these operations, you can create, delete, close, and complete other index-related operations. +The index API operations let you interact with indexes in your cluster. Using these operations, you can create, delete, close, and complete other index-related operations. -If you use the security plugin, make sure you have the appropriate permissions. +If you use the Security plugin, make sure you have the appropriate permissions. {: .note } diff --git a/_api-reference/index-apis/open-index.md b/_api-reference/index-apis/open-index.md new file mode 100644 index 00000000..6ca03486 --- /dev/null +++ b/_api-reference/index-apis/open-index.md @@ -0,0 +1,52 @@ +--- +layout: default +title: Open index +parent: Index APIs +nav_order: 55 +redirect_from: + - /opensearch/rest-api/index-apis/open-index/ +--- + +# Open index +**Introduced 1.0** +{: .label .label-purple } + +The open index API operation opens a closed index, letting you add or search for data within the index. + +## Example + +```json +POST /sample-index/_open +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +POST //_open +``` + +## URL parameters + +All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +<index-name> | String | The index to open. Can be a comma-separated list of multiple index names. Use `_all` or * to open all indexes. +allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indexes. Default is true. +expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are all (match all indexes), open (match open indexes), closed (match closed indexes), hidden (match hidden indexes), and none (do not accept wildcard expressions). Default is open. 
+ignore_unavailable | Boolean | If true, OpenSearch does not search for missing or closed indexes. Default is false. +wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the request. Default is 1 (only the primary shard). Set to all or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the request to succeed. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. +timeout | Time | How long to wait for a response from the cluster. Default is `30s`. +wait_for_completion | Boolean | When set to `false`, the request returns immediately instead of after the operation is finished. To monitor the operation status, use the [Tasks API]({{site.url}}{{site.baseurl}}/api-reference/tasks/) with the task ID returned by the request. Default is `true`. +task_execution_timeout | Time | The explicit task execution timeout. Only useful when wait_for_completion is set to `false`. Default is `1h`. + + +## Response +```json +{ + "acknowledged": true, + "shards_acknowledged": true +} +``` diff --git a/_api-reference/index-apis/put-mapping.md b/_api-reference/index-apis/put-mapping.md new file mode 100644 index 00000000..5f6be9f1 --- /dev/null +++ b/_api-reference/index-apis/put-mapping.md @@ -0,0 +1,113 @@ +--- +layout: default +title: Create or update mappings +parent: Index APIs +nav_order: 27 +redirect_from: + - /opensearch/rest-api/index-apis/update-mapping/ + - /opensearch/rest-api/update-mapping/ +--- + +# Create or update mappings +**Introduced 1.0** +{: .label .label-purple } + +If you want to create or add mappings and fields to an index, you can use the put mapping API operation. For an existing mapping, this operation updates the mapping. + +You can't use this operation to update mappings that already map to existing data in the index. You must first create a new index with your desired mappings, and then use the [reindex API operation]({{site.url}}{{site.baseurl}}/opensearch/reindex-data) to map all the documents from your old index to the new index. If you don't want any downtime while you re-index your indexes, you can use [aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias). + + +## Required path parameter + +The only required path parameter is the index with which to associate the mapping. If you don't specify an index, you will get an error. You can specify a single index, or multiple indexes separated by a comma as follows: + +``` +PUT //_mapping +PUT /,/_mapping +``` + +## Required request body field + +The request body must contain `properties`, which has all of the mappings that you want to create or update. + +```json +{ + "properties":{ + "color":{ + "type": "text" + }, + "year":{ + "type": "integer" + } + } +} +``` + +## Optional request body fields + +### dynamic + +You can make the document structure match the structure of the index mapping by setting the `dynamic` request body field to `strict`, as seen in the following example: + +```json +{ + "dynamic": "strict", + "properties":{ + "color":{ + "type": "text" + } + } +} +``` + +## Optional query parameters + +Optionally, you can add query parameters to make a more specific request. 
For example, to skip any missing or closed indexes in the response, you can add the `ignore_unavailable` query parameter to your request as follows: + +```json +PUT /sample-index/_mapping?ignore_unavailable +``` + +The following table defines the put mapping query parameters: + +Parameter | Data type | Description +:--- | :--- | :--- +allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. +expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are `all` (match all indexes), `open` (match open indexes), `closed` (match closed indexes), `hidden` (match hidden indexes), and `none` (do not accept wildcard expressions), which must be used with `open`, `closed`, or both. Default is `open`. +ignore_unavailable | Boolean | If true, OpenSearch does not include missing or closed indexes in the response. +ignore_malformed | Boolean | Use this parameter with the `ip_range` data type to specify that OpenSearch should ignore malformed fields. If `true`, OpenSearch does not include entries that do not match the IP range specified in the index in the response. The default is `false`. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. +timeout | Time | How long to wait for the response to return. Default is `30s`. +write_index_only | Boolean | Whether OpenSearch should apply mapping updates only to the write index. + +#### Sample Request + +The following request creates a new mapping for the `sample-index` index: + +```json +PUT /sample-index/_mapping + +{ + "properties": { + "age": { + "type": "integer" + }, + "occupation":{ + "type": "text" + } + } +} +``` +{% include copy-curl.html %} + +#### Sample Response + +Upon success, the response returns `"acknowledged": true`. + +```json +{ + "acknowledged": true +} +``` + + diff --git a/_api-reference/index-apis/shrink-index.md b/_api-reference/index-apis/shrink-index.md new file mode 100644 index 00000000..17b7c4df --- /dev/null +++ b/_api-reference/index-apis/shrink-index.md @@ -0,0 +1,87 @@ +--- +layout: default +title: Shrink index +parent: Index APIs +nav_order: 65 +redirect_from: + - /opensearch/rest-api/index-apis/shrink-index/ +--- + +# Shrink index +**Introduced 1.0** +{: .label .label-purple } + +The shrink index API operation moves all of your data in an existing index into a new index with fewer primary shards. + +## Example + +```json +POST /my-old-index/_shrink/my-new-index +{ + "settings": { + "index.number_of_replicas": 4, + "index.number_of_shards": 3 + }, + "aliases":{ + "new-index-alias": {} + } +} +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +POST //_shrink/ +PUT //_shrink/ +``` + +When creating new indexes with this operation, remember that OpenSearch indexes have the following naming restrictions: + +- All letters must be lowercase. +- Index names can't begin with underscores (`_`) or hyphens (`-`). +- Index names can't contain spaces, commas, or the following characters: + + `:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<` + +## URL parameters + +The shrink index API operation requires you to specify both the source index and the target index. All other parameters are optional. + +Parameter | Type | description +:--- | :--- | :--- +<index-name> | String | The index to shrink. +<target-index> | String | The target index to shrink the source index into. 
+wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the request. Default is 1 (only the primary shard). Set to all or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the request to succeed. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. +timeout | Time | How long to wait for the request to return a response. Default is `30s`. +wait_for_completion | Boolean | When set to `false`, the request returns immediately instead of after the operation is finished. To monitor the operation status, use the [Tasks API]({{site.url}}{{site.baseurl}}/api-reference/tasks/) with the task ID returned by the request. Default is `true`. +task_execution_timeout | Time | The explicit task execution timeout. Only useful when wait_for_completion is set to `false`. Default is `1h`. + +## Request body + +You can use the request body to configure some index settings for the target index. All fields are optional. + +Field | Type | Description +:--- | :--- | :--- +alias | Object | Sets an alias for the target index. Can have the fields `filter`, `index_routing`, `is_hidden`, `is_write_index`, `routing`, or `search_routing`. See [Index Aliases]({{site.url}}{{site.baseurl}}/api-reference/alias/#request-body). +settings | Object | Index settings you can apply to your target index. See [Index Settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/). +[max_shard_size](#the-max_shard_size-parameter) | Bytes | Specifies the maximum size of a primary shard in the target index. Because `max_shard_size` conflicts with the `index.number_of_shards` setting, you cannot set both of them at the same time. + +### The `max_shard_size` parameter + +The `max_shard_size` parameter specifies the maximum size of a primary shard in the target index. OpenSearch uses `max_shard_size` and the total storage for all primary shards in the source index to calculate the number of primary shards and their size for the target index. + +The primary shard count of the target index is the smallest factor of the source index's primary shard count for which the shard size does not exceed `max_shard_size`. For example, if the source index has 8 primary shards, they occupy a total of 400 GB of storage, and the `max_shard_size` is equal to 150 GB, OpenSearch calculates the number of primary shards in the target index using the following algorithm: + +1. Calculate the minimum number of primary shards as 400/150, rounded to the nearest whole integer. The minimum number of primary shards is 3. +1. Calculate the number of primary shards as the smallest factor of 8 that is greater than 3. The number of primary shards is 4. + +The maximum number of primary shards for the target index is equal to the number of primary shards in the source index because the shrink operation is used to reduce the primary shard count. As an example, consider a source index with 5 primary shards that occupy a total of 600 GB of storage. If `max_shard_size` is 100 GB, the minimum number of primary shards is 600/100, which is 6. However, because the number of primary shards in the source index is smaller than 6, the number of primary shards in the target index is set to 5. + +The minimum number of primary shards for the target index is 1. 
+{: .note} + +## Index codec considerations + +For index codec considerations, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#splits-and-shrinks). \ No newline at end of file diff --git a/_api-reference/index-apis/split.md b/_api-reference/index-apis/split.md new file mode 100644 index 00000000..03b2f742 --- /dev/null +++ b/_api-reference/index-apis/split.md @@ -0,0 +1,81 @@ +--- +layout: default +title: Split index +parent: Index APIs +nav_order: 70 +redirect_from: + - /opensearch/rest-api/index-apis/split/ +--- + +# Split index +**Introduced 1.0** +{: .label .label-purple } + +The split index API operation splits an existing read-only index into a new index, cutting each primary shard into some amount of primary shards in the new index. + +## Example + +```json +PUT /sample-index1/_split/split-index1 +{ + "settings": { + "index": { + "number_of_shards": 4, + "number_of_replicas": 2 + } + }, + "aliases": { + "sample-alias1": {} + } +} +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +POST //_split/ +PUT //_split/ +``` + +## Index naming restrictions + +OpenSearch indexes have the following naming restrictions: + +- All letters must be lowercase. +- Index names can't begin with underscores (`_`) or hyphens (`-`). +- Index names can't contain spaces, commas, or the following characters: + + `:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<` + +## URL parameters + +Your request must include the source and target indexes. All split index parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +<source-index> | String | The source index to split. +<target-index> | String | The index to create. +wait_for_active_shards | String | The number of active shards that must be available before OpenSearch processes the request. Default is 1 (only the primary shard). Set to all or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the operation to succeed. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. +timeout | Time | How long to wait for the request to return. Default is `30s`. +wait_for_completion | Boolean | When set to `false`, the request returns immediately instead of after the operation is finished. To monitor the operation status, use the [Tasks API]({{site.url}}{{site.baseurl}}/api-reference/tasks/) with the task ID returned by the request. Default is `true`. +task_execution_timeout | Time | The explicit task execution timeout. Only useful when wait_for_completion is set to `false`. Default is `1h`. + +## Request body + +The split index API operation creates a new target index, so you can specify any [index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/) and [aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias/) to apply to the target index. + +## Response + +```json +{ + "acknowledged": true, + "shards_acknowledged": true, + "index": "split-index1" +} +``` + +## Index codec considerations + +For index codec considerations, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#splits-and-shrinks). 
\ No newline at end of file diff --git a/_api-reference/index-apis/stats.md b/_api-reference/index-apis/stats.md new file mode 100644 index 00000000..d1338276 --- /dev/null +++ b/_api-reference/index-apis/stats.md @@ -0,0 +1,813 @@ +--- +layout: default +title: Stats +parent: Index APIs +nav_order: 72 +--- + +# Index Stats +**Introduced 1.0** +{: .label .label-purple } + +The Index Stats API provides index statistics. For data streams, the API provides statistics for the stream's backing indexes. By default, the returned statistics are index level. To receive shard-level statistics, set the `level` parameter to `shards`. + +When a shard moves to a different node, the shard-level statistics for the shard are cleared. Although the shard is no longer part of the node, the node preserves any node-level statistics to which the shard contributed. +{: .note} + +## Path and HTTP methods + +```json +GET /_stats +GET //_stats +GET //_stats/ +``` + +## Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `` | String | A comma-separated list of indexes, data streams, or index aliases used to filter results. Supports wildcard expressions. Defaults to `_all` (`*`). +`` | String | A comma-separated list of metric groups that will be included in the response. For valid values, see [Metrics](#metrics). Defaults to all metrics. | + +### Metrics + +The following table lists all available metric groups. + +Metric | Description +:--- |:---- +`_all` | Return all statistics. +`completion` | Completion suggester statistics. +`docs` | Returns the number of documents and the number of deleted documents that have not yet been merged. Index refresh operations can affect this statistic. +`fielddata` | Field data statistics. +`flush` | Flush statistics. +`get` | Get statistics, including missing stats. +`indexing` | Indexing statistics. +`merge` | Merge statistics. +`query_cache` | Query cache statistics. +`refresh` | Refresh statistics. +`request_cache` | Shard request cache statistics. +`search` | Search statistics, including suggest operation statistics. Search operations can be associated with one or more groups. You can include statistics for custom groups by providing a `groups` parameter, which accepts a comma-separated list of group names. To return statistics for all groups, use `_all`. +`segments` | Statistics about memory use of all open segments. If the `include_segment_file_sizes` parameter is `true`, this metric includes the aggregated disk usage of each Lucene index file. +`store` | Size of the index in byte units. +`translog` | Translog statistics. +`warmer` | Warmer statistics. + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +Parameter | Data type | Description +:--- | :--- | :--- +`expand_wildcards` | String | Specifies the type of indexes to which wildcard expressions can expand. Supports comma-separated values. Valid values are:
- `all`: Expand to all open and closed indexes, including hidden indexes.
- `open`: Expand to open indexes.
- `closed`: Expand to closed indexes.
- `hidden`: Include hidden indexes when expanding. Must be combined with `open`, `closed`, or both.
- `none`: Do not accept wildcard expressions.
Default is `open`. +`fields` | String | A comma-separated list or a wildcard expression specifying fields to include in the statistics. Specifies the default field list if neither `completion_fields` nor `fielddata_fields` is provided. +`completion_fields` | String | A comma-separated list or wildcard expression specifying fields to include in field-level `completion` statistics. +`fielddata_fields` | String | A comma-separated list or wildcard expression specifying fields to include in field-level `fielddata` statistics. +`forbid_closed_indices` | Boolean | Specifies not to collect statistics for closed indexes. Default is `true`. +`groups` | String | A comma-separated list of search groups to include in the `search` statistics. +`level` | String | Specifies the level used to aggregate statistics. Valid values are:
- `cluster`: Cluster-level statistics.
- `indices`: Index-level statistics.
- `shards`: Shard-level statistics.
Default is `indices`. +`include_segment_file_sizes` | Boolean | Specifies whether to report the aggregated disk usage of each Lucene index file. Only applies to `segments` statistics. Default is `false`. +`include_unloaded_segments` | Boolean | Specifies whether to include information from segments that are not loaded into memory. Default is `false`. + +#### Example request: One index + +```json +GET /testindex/_stats +``` +{% include copy-curl.html %} + +#### Example request: Comma-separated list of indexes + +```json +GET /testindex1,testindex2/_stats +``` +{% include copy-curl.html %} + +#### Example request: Wildcard expression + +```json +GET /testindex*/_stats +``` +{% include copy-curl.html %} + +#### Example request: Specific stats + +```json +GET /testindex/_stats/refresh,flush +``` +{% include copy-curl.html %} + +#### Example request: Expand wildcards + +```json +GET /testindex*/_stats?expand_wildcards=open,hidden +``` +{% include copy-curl.html %} + +#### Example request: Shard-level statistics + +```json +GET /testindex/_stats?level=shards +``` +{% include copy-curl.html %} + +#### Example response + +By default, the returned statistics are aggregated in the `primaries` and `total` aggregations. The `primaries` aggregation contains statistics for the primary shards. The `total` aggregation contains statistics for both primary and replica shards. The following is an example Index Stats API response: + +
+ + Response + + {: .text-delta} + +```json +{ + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_all": { + "primaries": { + "docs": { + "count": 4, + "deleted": 0 + }, + "store": { + "size_in_bytes": 15531, + "reserved_in_bytes": 0 + }, + "indexing": { + "index_total": 4, + "index_time_in_millis": 10, + "index_current": 0, + "index_failed": 0, + "delete_total": 0, + "delete_time_in_millis": 0, + "delete_current": 0, + "noop_update_total": 0, + "is_throttled": false, + "throttle_time_in_millis": 0 + }, + "get": { + "total": 0, + "time_in_millis": 0, + "exists_total": 0, + "exists_time_in_millis": 0, + "missing_total": 0, + "missing_time_in_millis": 0, + "current": 0 + }, + "search": { + "open_contexts": 0, + "query_total": 12, + "query_time_in_millis": 11, + "query_current": 0, + "fetch_total": 12, + "fetch_time_in_millis": 5, + "fetch_current": 0, + "scroll_total": 0, + "scroll_time_in_millis": 0, + "scroll_current": 0, + "point_in_time_total": 0, + "point_in_time_time_in_millis": 0, + "point_in_time_current": 0, + "suggest_total": 0, + "suggest_time_in_millis": 0, + "suggest_current": 0 + }, + "merges": { + "current": 0, + "current_docs": 0, + "current_size_in_bytes": 0, + "total": 0, + "total_time_in_millis": 0, + "total_docs": 0, + "total_size_in_bytes": 0, + "total_stopped_time_in_millis": 0, + "total_throttled_time_in_millis": 0, + "total_auto_throttle_in_bytes": 20971520 + }, + "refresh": { + "total": 8, + "total_time_in_millis": 58, + "external_total": 7, + "external_total_time_in_millis": 60, + "listeners": 0 + }, + "flush": { + "total": 1, + "periodic": 1, + "total_time_in_millis": 21 + }, + "warmer": { + "current": 0, + "total": 6, + "total_time_in_millis": 0 + }, + "query_cache": { + "memory_size_in_bytes": 0, + "total_count": 0, + "hit_count": 0, + "miss_count": 0, + "cache_size": 0, + "cache_count": 0, + "evictions": 0 + }, + "fielddata": { + "memory_size_in_bytes": 0, + "evictions": 0 + }, + "completion": { + "size_in_bytes": 0 + }, + "segments": { + "count": 4, + "memory_in_bytes": 0, + "terms_memory_in_bytes": 0, + "stored_fields_memory_in_bytes": 0, + "term_vectors_memory_in_bytes": 0, + "norms_memory_in_bytes": 0, + "points_memory_in_bytes": 0, + "doc_values_memory_in_bytes": 0, + "index_writer_memory_in_bytes": 0, + "version_map_memory_in_bytes": 0, + "fixed_bit_set_memory_in_bytes": 0, + "max_unsafe_auto_id_timestamp": -1, + "remote_store" : { + "upload" : { + "total_upload_size" : { + "started_bytes" : 152419, + "succeeded_bytes" : 152419, + "failed_bytes" : 0 + }, + "refresh_size_lag" : { + "total_bytes" : 0, + "max_bytes" : 0 + }, + "max_refresh_time_lag_in_millis" : 0, + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } + }, + "download" : { + "total_download_size" : { + "started_bytes" : 0, + "succeeded_bytes" : 0, + "failed_bytes" : 0 + }, + "total_time_spent_in_millis" : 0 + } + }, + "file_sizes": {} + }, + "translog": { + "operations": 0, + "size_in_bytes": 55, + "uncommitted_operations": 0, + "uncommitted_size_in_bytes": 55, + "earliest_last_modified_age": 142622215, + "remote_store" : { + "upload" : { + "total_uploads" : { + "started" : 57, + "failed" : 0, + "succeeded" : 57 + }, + "total_upload_size" : { + "started_bytes" : 16830, + "failed_bytes" : 0, + "succeeded_bytes" : 16830 + } + } + } + }, + "request_cache": { + "memory_size_in_bytes": 0, + "evictions": 0, + "hit_count": 0, + "miss_count": 0 + }, + "recovery": { + "current_as_source": 0, + "current_as_target": 0, + "throttle_time_in_millis": 0 + } + 
}, + "total": { + "docs": { + "count": 4, + "deleted": 0 + }, + "store": { + "size_in_bytes": 15531, + "reserved_in_bytes": 0 + }, + "indexing": { + "index_total": 4, + "index_time_in_millis": 10, + "index_current": 0, + "index_failed": 0, + "delete_total": 0, + "delete_time_in_millis": 0, + "delete_current": 0, + "noop_update_total": 0, + "is_throttled": false, + "throttle_time_in_millis": 0 + }, + "get": { + "total": 0, + "time_in_millis": 0, + "exists_total": 0, + "exists_time_in_millis": 0, + "missing_total": 0, + "missing_time_in_millis": 0, + "current": 0 + }, + "search": { + "open_contexts": 0, + "query_total": 12, + "query_time_in_millis": 11, + "query_current": 0, + "fetch_total": 12, + "fetch_time_in_millis": 5, + "fetch_current": 0, + "scroll_total": 0, + "scroll_time_in_millis": 0, + "scroll_current": 0, + "point_in_time_total": 0, + "point_in_time_time_in_millis": 0, + "point_in_time_current": 0, + "suggest_total": 0, + "suggest_time_in_millis": 0, + "suggest_current": 0 + }, + "merges": { + "current": 0, + "current_docs": 0, + "current_size_in_bytes": 0, + "total": 0, + "total_time_in_millis": 0, + "total_docs": 0, + "total_size_in_bytes": 0, + "total_stopped_time_in_millis": 0, + "total_throttled_time_in_millis": 0, + "total_auto_throttle_in_bytes": 20971520 + }, + "refresh": { + "total": 8, + "total_time_in_millis": 58, + "external_total": 7, + "external_total_time_in_millis": 60, + "listeners": 0 + }, + "flush": { + "total": 1, + "periodic": 1, + "total_time_in_millis": 21 + }, + "warmer": { + "current": 0, + "total": 6, + "total_time_in_millis": 0 + }, + "query_cache": { + "memory_size_in_bytes": 0, + "total_count": 0, + "hit_count": 0, + "miss_count": 0, + "cache_size": 0, + "cache_count": 0, + "evictions": 0 + }, + "fielddata": { + "memory_size_in_bytes": 0, + "evictions": 0 + }, + "completion": { + "size_in_bytes": 0 + }, + "segments": { + "count": 4, + "memory_in_bytes": 0, + "terms_memory_in_bytes": 0, + "stored_fields_memory_in_bytes": 0, + "term_vectors_memory_in_bytes": 0, + "norms_memory_in_bytes": 0, + "points_memory_in_bytes": 0, + "doc_values_memory_in_bytes": 0, + "index_writer_memory_in_bytes": 0, + "version_map_memory_in_bytes": 0, + "fixed_bit_set_memory_in_bytes": 0, + "max_unsafe_auto_id_timestamp": -1, + "remote_store" : { + "upload" : { + "total_upload_size" : { + "started_bytes" : 152419, + "succeeded_bytes" : 152419, + "failed_bytes" : 0 + }, + "refresh_size_lag" : { + "total_bytes" : 0, + "max_bytes" : 0 + }, + "max_refresh_time_lag_in_millis" : 0, + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } + }, + "download" : { + "total_download_size" : { + "started_bytes" : 0, + "succeeded_bytes" : 0, + "failed_bytes" : 0 + }, + "total_time_spent_in_millis" : 0 + } + }, + "file_sizes": {} + }, + "translog": { + "operations": 0, + "size_in_bytes": 55, + "uncommitted_operations": 0, + "uncommitted_size_in_bytes": 55, + "earliest_last_modified_age": 142622215, + "remote_store" : { + "upload" : { + "total_uploads" : { + "started" : 57, + "failed" : 0, + "succeeded" : 57 + }, + "total_upload_size" : { + "started_bytes" : 16830, + "failed_bytes" : 0, + "succeeded_bytes" : 16830 + } + } + } + }, + "request_cache": { + "memory_size_in_bytes": 0, + "evictions": 0, + "hit_count": 0, + "miss_count": 0 + }, + "recovery": { + "current_as_source": 0, + "current_as_target": 0, + "throttle_time_in_millis": 0 + } + } + }, + "indices": { + "testindex": { + "uuid": "0SXXSpe9Rp-FpxXXWLOD8Q", + "primaries": { + "docs": { + "count": 4, + 
"deleted": 0 + }, + "store": { + "size_in_bytes": 15531, + "reserved_in_bytes": 0 + }, + "indexing": { + "index_total": 4, + "index_time_in_millis": 10, + "index_current": 0, + "index_failed": 0, + "delete_total": 0, + "delete_time_in_millis": 0, + "delete_current": 0, + "noop_update_total": 0, + "is_throttled": false, + "throttle_time_in_millis": 0 + }, + "get": { + "total": 0, + "time_in_millis": 0, + "exists_total": 0, + "exists_time_in_millis": 0, + "missing_total": 0, + "missing_time_in_millis": 0, + "current": 0 + }, + "search": { + "open_contexts": 0, + "query_total": 12, + "query_time_in_millis": 11, + "query_current": 0, + "fetch_total": 12, + "fetch_time_in_millis": 5, + "fetch_current": 0, + "scroll_total": 0, + "scroll_time_in_millis": 0, + "scroll_current": 0, + "point_in_time_total": 0, + "point_in_time_time_in_millis": 0, + "point_in_time_current": 0, + "suggest_total": 0, + "suggest_time_in_millis": 0, + "suggest_current": 0 + }, + "merges": { + "current": 0, + "current_docs": 0, + "current_size_in_bytes": 0, + "total": 0, + "total_time_in_millis": 0, + "total_docs": 0, + "total_size_in_bytes": 0, + "total_stopped_time_in_millis": 0, + "total_throttled_time_in_millis": 0, + "total_auto_throttle_in_bytes": 20971520 + }, + "refresh": { + "total": 8, + "total_time_in_millis": 58, + "external_total": 7, + "external_total_time_in_millis": 60, + "listeners": 0 + }, + "flush": { + "total": 1, + "periodic": 1, + "total_time_in_millis": 21 + }, + "warmer": { + "current": 0, + "total": 6, + "total_time_in_millis": 0 + }, + "query_cache": { + "memory_size_in_bytes": 0, + "total_count": 0, + "hit_count": 0, + "miss_count": 0, + "cache_size": 0, + "cache_count": 0, + "evictions": 0 + }, + "fielddata": { + "memory_size_in_bytes": 0, + "evictions": 0 + }, + "completion": { + "size_in_bytes": 0 + }, + "segments": { + "count": 4, + "memory_in_bytes": 0, + "terms_memory_in_bytes": 0, + "stored_fields_memory_in_bytes": 0, + "term_vectors_memory_in_bytes": 0, + "norms_memory_in_bytes": 0, + "points_memory_in_bytes": 0, + "doc_values_memory_in_bytes": 0, + "index_writer_memory_in_bytes": 0, + "version_map_memory_in_bytes": 0, + "fixed_bit_set_memory_in_bytes": 0, + "max_unsafe_auto_id_timestamp": -1, + "remote_store" : { + "upload" : { + "total_upload_size" : { + "started_bytes" : 152419, + "succeeded_bytes" : 152419, + "failed_bytes" : 0 + }, + "refresh_size_lag" : { + "total_bytes" : 0, + "max_bytes" : 0 + }, + "max_refresh_time_lag_in_millis" : 0, + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } + }, + "download" : { + "total_download_size" : { + "started_bytes" : 0, + "succeeded_bytes" : 0, + "failed_bytes" : 0 + }, + "total_time_spent_in_millis" : 0 + } + }, + "file_sizes": {} + }, + "translog": { + "operations": 0, + "size_in_bytes": 55, + "uncommitted_operations": 0, + "uncommitted_size_in_bytes": 55, + "earliest_last_modified_age": 142622215, + "remote_store" : { + "upload" : { + "total_uploads" : { + "started" : 57, + "failed" : 0, + "succeeded" : 57 + }, + "total_upload_size" : { + "started_bytes" : 16830, + "failed_bytes" : 0, + "succeeded_bytes" : 16830 + } + } + } + }, + "request_cache": { + "memory_size_in_bytes": 0, + "evictions": 0, + "hit_count": 0, + "miss_count": 0 + }, + "recovery": { + "current_as_source": 0, + "current_as_target": 0, + "throttle_time_in_millis": 0 + } + }, + "total": { + "docs": { + "count": 4, + "deleted": 0 + }, + "store": { + "size_in_bytes": 15531, + "reserved_in_bytes": 0 + }, + "indexing": { + "index_total": 4, + 
"index_time_in_millis": 10, + "index_current": 0, + "index_failed": 0, + "delete_total": 0, + "delete_time_in_millis": 0, + "delete_current": 0, + "noop_update_total": 0, + "is_throttled": false, + "throttle_time_in_millis": 0 + }, + "get": { + "total": 0, + "time_in_millis": 0, + "exists_total": 0, + "exists_time_in_millis": 0, + "missing_total": 0, + "missing_time_in_millis": 0, + "current": 0 + }, + "search": { + "open_contexts": 0, + "query_total": 12, + "query_time_in_millis": 11, + "query_current": 0, + "fetch_total": 12, + "fetch_time_in_millis": 5, + "fetch_current": 0, + "scroll_total": 0, + "scroll_time_in_millis": 0, + "scroll_current": 0, + "point_in_time_total": 0, + "point_in_time_time_in_millis": 0, + "point_in_time_current": 0, + "suggest_total": 0, + "suggest_time_in_millis": 0, + "suggest_current": 0 + }, + "merges": { + "current": 0, + "current_docs": 0, + "current_size_in_bytes": 0, + "total": 0, + "total_time_in_millis": 0, + "total_docs": 0, + "total_size_in_bytes": 0, + "total_stopped_time_in_millis": 0, + "total_throttled_time_in_millis": 0, + "total_auto_throttle_in_bytes": 20971520 + }, + "refresh": { + "total": 8, + "total_time_in_millis": 58, + "external_total": 7, + "external_total_time_in_millis": 60, + "listeners": 0 + }, + "flush": { + "total": 1, + "periodic": 1, + "total_time_in_millis": 21 + }, + "warmer": { + "current": 0, + "total": 6, + "total_time_in_millis": 0 + }, + "query_cache": { + "memory_size_in_bytes": 0, + "total_count": 0, + "hit_count": 0, + "miss_count": 0, + "cache_size": 0, + "cache_count": 0, + "evictions": 0 + }, + "fielddata": { + "memory_size_in_bytes": 0, + "evictions": 0 + }, + "completion": { + "size_in_bytes": 0 + }, + "segments": { + "count": 4, + "memory_in_bytes": 0, + "terms_memory_in_bytes": 0, + "stored_fields_memory_in_bytes": 0, + "term_vectors_memory_in_bytes": 0, + "norms_memory_in_bytes": 0, + "points_memory_in_bytes": 0, + "doc_values_memory_in_bytes": 0, + "index_writer_memory_in_bytes": 0, + "version_map_memory_in_bytes": 0, + "fixed_bit_set_memory_in_bytes": 0, + "max_unsafe_auto_id_timestamp": -1, + "remote_store" : { + "upload" : { + "total_upload_size" : { + "started_bytes" : 152419, + "succeeded_bytes" : 152419, + "failed_bytes" : 0 + }, + "refresh_size_lag" : { + "total_bytes" : 0, + "max_bytes" : 0 + }, + "max_refresh_time_lag_in_millis" : 0, + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } + }, + "download" : { + "total_download_size" : { + "started_bytes" : 0, + "succeeded_bytes" : 0, + "failed_bytes" : 0 + }, + "total_time_spent_in_millis" : 0 + } + }, + "file_sizes": {} + }, + "translog": { + "operations": 0, + "size_in_bytes": 55, + "uncommitted_operations": 0, + "uncommitted_size_in_bytes": 55, + "earliest_last_modified_age": 142622215, + "remote_store" : { + "upload" : { + "total_uploads" : { + "started" : 57, + "failed" : 0, + "succeeded" : 57 + }, + "total_upload_size" : { + "started_bytes" : 16830, + "failed_bytes" : 0, + "succeeded_bytes" : 16830 + } + } + } + }, + "request_cache": { + "memory_size_in_bytes": 0, + "evictions": 0, + "hit_count": 0, + "miss_count": 0 + }, + "recovery": { + "current_as_source": 0, + "current_as_target": 0, + "throttle_time_in_millis": 0 + } + } + } + } +} +``` +
+ +## Response fields + +For information about response fields, see [Nodes Stats API response fields]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-stats/#indices). diff --git a/_api-reference/index-apis/update-settings.md b/_api-reference/index-apis/update-settings.md new file mode 100644 index 00000000..3f38418e --- /dev/null +++ b/_api-reference/index-apis/update-settings.md @@ -0,0 +1,68 @@ +--- +layout: default +title: Update settings +parent: Index APIs +nav_order: 75 +redirect_from: + - /opensearch/rest-api/index-apis/update-settings/ +--- + +# Update settings +**Introduced 1.0** +{: .label .label-purple } + +You can use the update settings API operation to update index-level settings. You can change dynamic index settings at any time, but static settings cannot be changed after index creation. For more information about static and dynamic index settings, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). + +Aside from the static and dynamic index settings, you can also update individual plugins' settings. To get the full list of updatable settings, run `GET /_settings?include_defaults=true`. + +## Example + +```json +PUT /sample-index1/_settings +{ + "index.plugins.index_state_management.rollover_skip": true, + "index": { + "number_of_replicas": 4 + } +} +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +``` +PUT //_settings +``` + +## Query parameters + +All update settings parameters are optional. + +Parameter | Data type | Description +:--- | :--- | :--- +allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. +expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are `all` (match all indexes), `open` (match open indexes), `closed` (match closed indexes), `hidden` (match hidden indexes), and `none` (do not accept wildcard expressions), which must be used with `open`, `closed`, or both. Default is `open`. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. +preserve_existing | Boolean | Whether to preserve existing index settings. Default is false. +timeout | Time | How long to wait for a connection to return. Default is `30s`. + +## Request body + +The request body must all of the index settings that you want to update. + +```json +{ + "index.plugins.index_state_management.rollover_skip": true, + "index": { + "number_of_replicas": 4 + } +} +``` + +## Response + +```json +{ + "acknowledged": true +} +``` diff --git a/_api-reference/index.md b/_api-reference/index.md new file mode 100644 index 00000000..41d54989 --- /dev/null +++ b/_api-reference/index.md @@ -0,0 +1,66 @@ +--- +layout: default +title: REST API reference +nav_order: 1 +has_toc: false +has_children: true +nav_exclude: true +permalink: /api-reference/ +redirect_from: + - /opensearch/rest-api/index/ + - /api-reference/index/ +--- + +# REST API reference +**Introduced 1.0** +{: .label .label-purple } + +You can use REST APIs for most operations in OpenSearch. In this reference, we provide a description of the API, and details that include the paths and HTTP methods, supported parameters, and example requests and responses. + +This reference includes the REST APIs supported by OpenSearch. If a REST API is missing, please provide feedback or submit a pull request in GitHub. 
+{: .tip } + +## Related articles + +- [Analyze API]({{site.url}}{{site.baseurl}}/api-reference/analyze-apis/) +- [Access control API]({{site.url}}{{site.baseurl}}/security/access-control/api/) +- [Alerting API]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/api/) +- [Anomaly detection API]({{site.url}}{{site.baseurl}}/observing-your-data/ad/api/) +- [CAT APIs]({{site.url}}{{site.baseurl}}/api-reference/cat/index/) +- [Cluster APIs]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/index/) +- [Common REST parameters]({{site.url}}{{site.baseurl}}/api-reference/common-parameters/) +- [Count]({{site.url}}{{site.baseurl}}/api-reference/count/) +- [Cross-cluster replication API]({{site.url}}{{site.baseurl}}/tuning-your-cluster/replication-plugin/api/) +- [Document APIs]({{site.url}}{{site.baseurl}}/api-reference/document-apis/index/) +- [Explain]({{site.url}}{{site.baseurl}}/api-reference/explain/) +- [Index APIs]({{site.url}}{{site.baseurl}}/api-reference/index-apis/index/) +- [Index rollups API]({{site.url}}{{site.baseurl}}/im-plugin/index-rollups/rollup-api/) +- [Index state management API]({{site.url}}{{site.baseurl}}/im-plugin/ism/api/) +- [ISM error prevention API]({{site.url}}{{site.baseurl}}/im-plugin/ism/error-prevention/api/) +- [Ingest APIs]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) +- [k-NN plugin API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api/) +- [ML Commons API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/) +- [Multi-search]({{site.url}}{{site.baseurl}}/api-reference/multi-search/) +- [Nodes APIs]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/) +- [Notifications API]({{site.url}}{{site.baseurl}}/observing-your-data/notifications/api/) +- [Performance analyzer API]({{site.url}}{{site.baseurl}}/monitoring-your-cluster/pa/api/) +- [Point in Time API]({{site.url}}{{site.baseurl}}/search-plugins/point-in-time-api/) +- [Popular APIs]({{site.url}}{{site.baseurl}}/api-reference/popular-api/) +- [Ranking evaluation]({{site.url}}{{site.baseurl}}/api-reference/rank-eval/) +- [Refresh search analyzer]({{site.url}}{{site.baseurl}}/im-plugin/refresh-analyzer/) +- [Remove cluster information]({{site.url}}{{site.baseurl}}/api-reference/remote-info/) +- [Root cause analysis API]({{site.url}}{{site.baseurl}}/monitoring-your-cluster/pa/rca/api/) +- [Snapshot management API]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/sm-api/) +- [Script APIs]({{site.url}}{{site.baseurl}}/api-reference/script-apis/index/) +- [Scroll]({{site.url}}{{site.baseurl}}/api-reference/scroll/) +- [Search]({{site.url}}{{site.baseurl}}/api-reference/search/) +- [Search relevance stats API]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/stats-api/) +- [Security analytics APIs]({{site.url}}{{site.baseurl}}/security-analytics/api-tools/index/) +- [Snapshot APIs]({{site.url}}{{site.baseurl}}/api-reference/snapshots/index/) +- [Stats API]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/stats-api/) +- [Supported units]({{site.url}}{{site.baseurl}}/api-reference/units/) +- [Tasks]({{site.url}}{{site.baseurl}}/api-reference/tasks/) +- [Transforms API]({{site.url}}{{site.baseurl}}/im-plugin/index-transforms/transforms-apis/) + + + diff --git a/_api-reference/ingest-apis/index.md b/_api-reference/ingest-apis/index.md new file mode 100644 index 00000000..6cea0a9f --- /dev/null +++ b/_api-reference/ingest-apis/index.md @@ -0,0 +1,23 @@ +--- +layout: default +title: Ingest APIs +has_children: 
false +nav_order: 40 +redirect_from: + - /opensearch/rest-api/ingest-apis/index/ +--- + +# Ingest APIs +**Introduced 1.0** +{: .label .label-purple } + +Ingest APIs are a valuable tool for loading data into a system. Ingest APIs work together with [ingest pipelines]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/ingest-pipelines/) and [ingest processors]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/ingest-processors/) to process or transform data from a variety of sources and in a variety of formats. + +## Ingest pipeline APIs + +Simplify, secure, and scale your OpenSearch data ingestion with the following APIs: + +- [Create pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/create-ingest/): Use this API to create or update a pipeline configuration. +- [Get pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/get-ingest/): Use this API to retrieve a pipeline configuration. +- [Simulate pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/simulate-ingest/): Use this pipeline to test a pipeline configuration. +- [Delete pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/delete-ingest/): Use this API to delete a pipeline configuration. diff --git a/_opensearch/rest-api/multi-search.md b/_api-reference/multi-search.md similarity index 90% rename from _opensearch/rest-api/multi-search.md rename to _api-reference/multi-search.md index ecdfb26d..25f28c5e 100644 --- a/_opensearch/rest-api/multi-search.md +++ b/_api-reference/multi-search.md @@ -1,17 +1,17 @@ --- layout: default title: Multi-search -parent: REST API reference -nav_order: 130 +nav_order: 45 +redirect_from: + - /opensearch/rest-api/multi-search/ --- # Multi-search -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } As the name suggests, the multi-search operation lets you bundle multiple search requests into a single request. OpenSearch then executes the searches in parallel, so you get back the response more quickly compared to sending one request per search. OpenSearch executes each search independently, so the failure of one doesn't affect the others. - ## Example ```json @@ -22,6 +22,7 @@ GET _msearch { "query": { "match_all": {} } } ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -46,10 +47,10 @@ Query\n ``` -- Metadata lines include options, such as which indices to search and the type of search. +- Metadata lines include options, such as which indexes to search and the type of search. - Query lines use the [query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/). -Just like the [bulk]({{site.url}}{{site.baseurl}}/opensearch/rest-api/document-apis/bulk/) operation, the JSON doesn't need to be minified---spaces are fine---but it does need to be on a single line. OpenSearch uses newline characters to parse multi-search requests and requires that the request body end with a newline character. +Just like the [bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) operation, the JSON doesn't need to be minified---spaces are fine---but it does need to be on a single line. OpenSearch uses newline characters to parse multi-search requests and requires that the request body end with a newline character. ## URL parameters and metadata options @@ -58,10 +59,11 @@ All multi-search URL parameters are optional. Some can also be applied per-searc Parameter | Type | Description | Supported in metadata line :--- | :--- | :--- -allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indices. Default is `true`. 
| Yes +allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indexes. Default is `true`. | Yes +cancel_after_time_interval | Time | The time after which the search request will be canceled. Supported at both parent and child request levels. The order of precedence is:
1. Child-level parameter
2. Parent-level parameter
3. [Cluster setting]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings).
Default is -1. | Yes ccs_minimize_roundtrips | Boolean | Whether OpenSearch should try to minimize the number of network round trips between the coordinating node and remote clusters (only applicable to cross-cluster search requests). Default is `true`. | No -expand_wildcards | Enum | Expands wildcard expressions to concrete indices. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. | Yes -ignore_unavailable | Boolean | If an index from the indices list doesn’t exist, whether to ignore it rather than fail the query. Default is `false`. | Yes +expand_wildcards | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. | Yes +ignore_unavailable | Boolean | If an index from the indexes list doesn’t exist, whether to ignore it rather than fail the query. Default is `false`. | Yes max_concurrent_searches | Integer | The maximum number of concurrent searches. The default depends on your node count and search thread pool size. Higher values can improve performance, but risk overloading the cluster. | No max_concurrent_shard_requests | Integer | Maximum number of concurrent shard requests that each search executes per node. Default is 5. Higher values can improve performance, but risk overloading the cluster. | No pre_filter_shard_size | Integer | Default is 128. | No @@ -80,10 +82,10 @@ Some options can't be applied as URL parameters to the entire request. Instead, Option | Type | Description :--- | :--- | :--- -index | String, string array | If you don't specify an index or multiple indices as part of the URL (or want to override the URL value for an individual search), you can include it here. Examples include `"logs-*"` and `["my-store", "sample_data_ecommerce"]`. +index | String, string array | If you don't specify an index or multiple indexes as part of the URL (or want to override the URL value for an individual search), you can include it here. Examples include `"logs-*"` and `["my-store", "sample_data_ecommerce"]`. preference | String | The nodes or shards that you'd like to perform the search. This setting can be useful for testing, but in most situations, the default behavior provides the best search latencies. Options include `_local`, `_only_local`, `_prefer_nodes`, `_only_nodes`, and `_shards`. These last three options accept a list of nodes or shards. Examples include `"_only_nodes:data-node1,data-node2"` and `"_shards:0,1`. -request_cache | Boolean | Whether to cache results, which can improve latency for repeat searches. Default is to use the `index.requests.cache.enable` setting for the index (which defaults to `true` for new indices). +request_cache | Boolean | Whether to cache results, which can improve latency for repeat searches. Default is to use the `index.requests.cache.enable` setting for the index (which defaults to `true` for new indexes). -routing | String | Comma-separated custom routing values (e.g. `"routing": "value1,value2,value3"`. +routing | String | Comma-separated custom routing values, for example, `"routing": "value1,value2,value3"`.
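+
+The following example is a sketch of how these metadata options can be combined per search; the index patterns and routing values are taken from the descriptions above, and the `match_all` queries are placeholders:
+
+```json
+GET _msearch
+{ "index": "logs-*", "preference": "_local", "request_cache": false }
+{ "query": { "match_all": {} } }
+{ "index": ["my-store", "sample_data_ecommerce"], "routing": "value1,value2,value3" }
+{ "query": { "match_all": {} } }
+```
+{% include copy-curl.html %}
+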
## Response @@ -112,7 +114,6 @@ OpenSearch returns an array with the results of each search in the same order as "hits" : [ { "_index" : "opensearch_dashboards_sample_data_logs", - "_type" : "_doc", "_id" : "_fnhBXsBgv2Zxgu9dZ8Y", "_score" : 1.0, "_source" : { @@ -177,7 +178,6 @@ OpenSearch returns an array with the results of each search in the same order as "hits" : [ { "_index" : "opensearch_dashboards_sample_data_ecommerce", - "_type" : "_doc", "_id" : "efnhBXsBgv2Zxgu9ap7e", "_score" : 1.0, "_source" : { diff --git a/_api-reference/nodes-apis/index.md b/_api-reference/nodes-apis/index.md new file mode 100644 index 00000000..ac5e767e --- /dev/null +++ b/_api-reference/nodes-apis/index.md @@ -0,0 +1,80 @@ +--- +layout: default +title: Nodes APIs +has_children: true +nav_order: 50 +--- + +# Nodes API +**Introduced 1.0** +{: .label .label-purple } + +The nodes API makes it possible to retrieve information about individual nodes within your cluster. + +## Node filters + +Use the `` parameter to filter the target set of nodes in the API response. + + + +Parameter | Type | Description +:--- |:-------| :--- +`` | String | A comma-separated list of resolution mechanisms that OpenSearch uses to identify cluster nodes. + +Node filters support several node resolution mechanisms: + +- Predefined constants: `_local`, `_cluster_manager`, or `_all`. +- An exact match for `nodeID` +- A simple case-sensitive wildcard pattern matching for `node-name`, `host-name`, or `host-IP-address`. +- Node roles where the `` value is set either to `true` or `false`: + - `cluster_manager:` refers to all cluster manager-eligible nodes. + - `data:` refers to all data nodes. + - `ingest:` refers to all ingest nodes. + - `voting_only:` refers to all voting-only nodes. + - `ml:` refers to all machine learning (ML) nodes. + - `coordinating_only:` refers to all coordinating-only nodes. +- A simple case-sensitive wildcard pattern matching for node attributes: `:`. The wildcard matching pattern can be used in both the key and value at the same time. + +Resolution mechanisms are applied sequentially in the order specified by the client. Each mechanism specification can either add or remove nodes. + +To get statistics from the elected cluster manager node only, use the following query : + +```json +GET /_nodes/_cluster_manager/stats +``` +{% include copy-curl.html %} + +To get statistics from nodes that are data-only nodes, use the following query: + +```json +GET /_nodes/data:true/stats +``` +{% include copy-curl.html %} + +### Order of resolution mechanisms + +The order of resolution mechanisms is applied sequentially, and each can add or remove nodes. The following examples yield different results. 
+ +To get statistics from all the nodes except the cluster manager node, use the following query: + +```json +GET /_nodes/_all,cluster_manager:false/stats +``` +{% include copy-curl.html %} + +However, if you switch the resolution mechanisms, the result will include all the cluster nodes, including the cluster manager node: + +```json +GET /_nodes/cluster_manager:false,_all/stats +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_api-reference/nodes-apis/nodes-hot-threads.md b/_api-reference/nodes-apis/nodes-hot-threads.md new file mode 100644 index 00000000..3fb6ff65 --- /dev/null +++ b/_api-reference/nodes-apis/nodes-hot-threads.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Nodes hot threads +parent: Nodes APIs +nav_order: 30 +--- + +# Nodes hot threads +**Introduced 1.0** +{: .label .label-purple } + +The nodes hot threads endpoint provides information about busy JVM threads for selected cluster nodes. It provides a unique view of the of activity each node. + +#### Example + +```json +GET /_nodes/hot_threads +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +```json +GET /_nodes/hot_threads +GET /_nodes//hot_threads +``` + +## Path parameters + +You can include the following optional path parameter in your request. + +Parameter | Type | Description +:--- | :--- | :--- +nodeId | String | A comma-separated list of node IDs used to filter results. Supports [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters). Defaults to `_all`. + +## Query parameters + +You can include the following query parameters in your request. All query parameters are optional. + +Parameter | Type | Description +:--- | :---| :--- +snapshots | Integer | The number of samples of thread stacktraces. Defaults to `10`. +interval | Time | The interval between consecutive samples. Defaults to `500ms`. +threads | Integer | The number of the busiest threads to return information about. Defaults to `3`. +ignore_idle_threads | Boolean | Don’t show threads that are in known idle states, such as waiting on a socket select or pulling from an empty task queue. Defaults to `true`. +type | String | Supported thread types are `cpu`, `wait`, or `block`. Defaults to `cpu`. +timeout | Time | Sets the time limit for node response. Default value is `30s`. + +#### Example request + +```json +GET /_nodes/hot_threads +``` +{% include copy-curl.html %} + +#### Example response + +```bash +::: {opensearch}{F-ByTQzVQ3GQeYzQJArJGQ}{GxbcLdCATPWggOuQHJAoCw}{127.0.0.1}{127.0.0.1:9300}{dimr}{shard_indexing_pressure_enabled=true} + Hot threads at 2022-09-29T19:46:44.533Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true: + + 0.1% (455.5micros out of 500ms) cpu usage by thread 'ScheduledMetricCollectorsExecutor' + 10/10 snapshots sharing following 2 elements + java.base@17.0.4/java.lang.Thread.sleep(Native Method) + org.opensearch.performanceanalyzer.collectors.ScheduledMetricCollectorsExecutor.run(ScheduledMetricCollectorsExecutor.java:100) +``` + +## Response + +Unlike the majority of OpenSearch API responses, this response is in a text format. + +It consists of one section per each cluster node included in the response. + +Each section starts with a single line containing the following segments: + +Line segment | Description +:--- |:------- +:::  | Line start (a distinct visual symbol). +`{global-eu-35}` | Node name. +`{uFPbKLDOTlOmdnwUlKW8sw}` | NodeId. +`{OAM8OT5CQAyasWuIDeVyUA}` | EphemeralId. +`{global-eu-35.local}` | Host name. 
+`{[gdv2:a284:2acv:5fa6:0:3a2:7260:74cf]:9300}` | Host address. +`{dimr}` | Node roles (d=data, i=ingest, m=cluster manager, r=remote cluster client). +`{zone=west-a2, shard_indexing_pressure_enabled=true}` | Node attributes. + +Then information about threads of the selected type is provided. + +```bash +::: {global-eu-35}{uFPbKLDOTlOmdnwUlKW8sw}{OAM8OT5CQAyasWuIDeVyUA}{global-eu-35.local}{[gdv2:a284:2acv:5fa6:0:3a2:7260:74cf]:9300}{dimr}{zone=west-a2, shard_indexing_pressure_enabled=true} + Hot threads at 2022-04-01T15:15:27.658Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true: + + 0.1% (645micros out of 500ms) cpu usage by thread 'opensearch[global-eu-35][transport_worker][T#7]' + 4/10 snapshots sharing following 3 elements + io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986) + io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) + java.base@11.0.14.1/java.lang.Thread.run(Thread.java:829) +::: {global-eu-62}{4knOxAdERlOB19zLQIT1bQ}{HJuZs2HiQ_-8Elj0Fvi_1g}{global-eu-62.local}{[gdv2:a284:2acv:5fa6:0:3a2:bba6:fe3f]:9300}{dimr}{zone=west-a2, shard_indexing_pressure_enabled=true} + Hot threads at 2022-04-01T15:15:27.659Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true: + + 18.7% (93.4ms out of 500ms) cpu usage by thread 'opensearch[global-eu-62][transport_worker][T#3]' + 6/10 snapshots sharing following 3 elements + io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986) + io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) + java.base@11.0.14.1/java.lang.Thread.run(Thread.java:829) +::: {global-eu-44}{8WW3hrkcTwGvgah_L8D_jw}{Sok7spHISFyol0jFV6i0kw}{global-eu-44.local}{[gdv2:a284:2acv:5fa6:0:3a2:9120:e79e]:9300}{dimr}{zone=west-a2, shard_indexing_pressure_enabled=true} + Hot threads at 2022-04-01T15:15:27.659Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true: + + 42.6% (212.7ms out of 500ms) cpu usage by thread 'opensearch[global-eu-44][write][T#5]' + 2/10 snapshots sharing following 43 elements + java.base@11.0.14.1/sun.nio.ch.IOUtil.write1(Native Method) + java.base@11.0.14.1/sun.nio.ch.EPollSelectorImpl.wakeup(EPollSelectorImpl.java:254) + io.netty.channel.nio.NioEventLoop.wakeup(NioEventLoop.java:787) + io.netty.util.concurrent.SingleThreadEventExecutor.execute(SingleThreadEventExecutor.java:846) + io.netty.util.concurrent.SingleThreadEventExecutor.execute(SingleThreadEventExecutor.java:815) + io.netty.channel.AbstractChannelHandlerContext.safeExecute(AbstractChannelHandlerContext.java:989) + io.netty.channel.AbstractChannelHandlerContext.write(AbstractChannelHandlerContext.java:796) + io.netty.channel.AbstractChannelHandlerContext.writeAndFlush(AbstractChannelHandlerContext.java:758) + io.netty.channel.DefaultChannelPipeline.writeAndFlush(DefaultChannelPipeline.java:1020) + io.netty.channel.AbstractChannel.writeAndFlush(AbstractChannel.java:311) + org.opensearch.transport.netty4.Netty4TcpChannel.sendMessage(Netty4TcpChannel.java:159) + app//org.opensearch.transport.OutboundHan... +``` + +## Required permissions + +If you use the Security plugin, make sure you set the following permissions: `cluster:monitor/nodes/hot_threads`. 
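+
+As an additional sketch (the parameter values here are illustrative, not recommendations), the following request samples the five busiest `wait`-state threads on data nodes only, using a one-second sampling interval:
+
+```json
+GET /_nodes/data:true/hot_threads?threads=5&type=wait&interval=1s
+```
+{% include copy-curl.html %}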
diff --git a/_api-reference/nodes-apis/nodes-info.md b/_api-reference/nodes-apis/nodes-info.md new file mode 100644 index 00000000..d7c81041 --- /dev/null +++ b/_api-reference/nodes-apis/nodes-info.md @@ -0,0 +1,169 @@ +--- +layout: default +title: Nodes info +parent: Nodes APIs +nav_order: 10 +--- + +# Nodes info +**Introduced 1.0** +{: .label .label-purple } + +The nodes info API represents mostly static information about your cluster's nodes, including but not limited to: + +- Host system information +- JVM +- Processor Type +- Node settings +- Thread pools settings +- Installed plugins + +## Example + +To get information about all nodes in a cluster, use the following query: + +```json +GET /_nodes +``` +{% include copy-curl.html %} + +To get thread pool information about the cluster manager node only, use the following query: + +```json +GET /_nodes/master:true/thread_pool +``` +{% include copy-curl.html %} + +## Path and HTTP methods + +```bash +GET /_nodes +GET /_nodes/ +GET /_nodes/ +GET /_nodes// +# or full path equivalent +GET /_nodes//info/ +``` + +## Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +Parameter | Type | Description +:--- |:-------| :--- +nodeId | String | A comma-separated list of nodeIds used to filter results. Supports [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters). Defaults to `_all`. +metrics | String | A comma-separated list of metric groups that will be included in the response. For example, `jvm,thread_pool`. Defaults to all metrics. + +The following table lists all available metric groups. + +Metric | Description +:--- |:---- +settings | A node's settings. This is a combination of the default settings, custom settings from the [configuration file]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/#configuration-file), and dynamically [updated settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/#updating-cluster-settings-using-the-api). +os | Static information about the host OS, including version, processor architecture, and available/allocated processors. +process | Contains the process ID. +jvm | Detailed static information about the running JVM, including arguments. +thread_pool | Configured options for all individual thread pools. +transport | Mostly static information about the transport layer. +http | Mostly static information about the HTTP layer. +plugins | Information about installed plugins and modules. +ingest | Information about ingest pipelines and available ingest processors. +aggregations | Information about available [aggregations]({{site.url}}{{site.baseurl}}/opensearch/aggregations). +indices | Static index settings configured at the node level. + +## Query parameters + +You can include the following query parameters in your request. All query parameters are optional. + +Parameter | Type | Description +:--- |:-------| :--- +flat_settings| Boolean | Specifies whether to return the `settings` object of the response in flat format. Default is `false`. +timeout | Time | Sets the time limit for node response. Default value is `30s`. 
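+
+For example, the following request (a sketch that combines the `settings` metric with `flat_settings`; any node filter could be used in place of `_all`) returns node settings in flat form:
+
+```json
+GET /_nodes/_all/settings?flat_settings=true
+```
+{% include copy-curl.html %}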
+ +#### Example request + +The following query requests the `process` and `transport` metrics from the cluster manager node: + +```json +GET /_nodes/cluster_manager:true/process,transport +``` +{% include copy-curl.html %} + +#### Example response + +The response contains the metric groups specified in the `` request parameter (in this case, `process` and `transport`): + +```json +{ + "_nodes": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "cluster_name": "opensearch", + "nodes": { + "VC0d4RgbTM6kLDwuud2XZQ": { + "name": "node-m1-23", + "transport_address": "127.0.0.1:9300", + "host": "127.0.0.1", + "ip": "127.0.0.1", + "version": "1.3.1", + "build_type": "tar", + "build_hash": "c4c0672877bf0f787ca857c7c37b775967f93d81", + "roles": [ + "data", + "ingest", + "master", + "remote_cluster_client" + ], + "attributes": { + "shard_indexing_pressure_enabled": "true" + }, + "process" : { + "refresh_interval_in_millis": 1000, + "id": 44584, + "mlockall": false + }, + "transport": { + "bound_address": [ + "[::1]:9300", + "127.0.0.1:9300" + ], + "publish_address": "127.0.0.1:9300", + "profiles": { } + } + } + } +} +``` + +## Response fields + +The response contains the basic node identification and build info for every node matching the `` request parameter. The following table lists the response fields. + +Field | Description +:--- |:---- +name | The node's name. +transport_address | The node's transport address. +host | The node's host address. +ip | The node's host IP address. +version | The node's OpenSearch version. +build_type | The node's build type, like `rpm`, `docker`, `tar`, etc. +build_hash | The git commit hash of the build. +total_indexing_buffer | The maximum heap size in bytes used to hold newly indexed documents. Once this heap size is exceeded, the documents are written to disk. +roles | The list of the node's roles. +attributes | The node's attributes. +os | Information about the OS, including name, version, architecture, refresh interval, and the number of available and allocated processors. +process | Information about the currently running process, including PID, refresh interval, and `mlockall`, which specifies whether the process address space has been successfully locked in memory. +jvm | Information about the JVM, including PID, version, memory information, garbage collector information, and arguments. +thread_pool | Information about the thread pool. +transport | Information about the transport address, including bound address, publish address, and profiles. +http | Information about the HTTP address, including bound address, publish address, and maximum content length, in bytes. +plugins | Information about the installed plugins, including name, version, OpenSearch version, Java version, description, class name, custom folder name, a list of extended plugins, and `has_native_controller`, which specifies whether the plugin has a native controller process. +modules | Information about the modules, including name, version, OpenSearch version, Java version, description, class name, custom folder name, a list of extended plugins, and `has_native_controller`, which specifies whether the plugin has a native controller process. Modules are different from plugins because modules are loaded into OpenSearch automatically, while plugins have to be installed manually. +ingest | Information about ingest pipelines and processors. +aggregations | Information about the available aggregation types. 
+ + +## Required permissions + +If you use the Security plugin, make sure you have the appropriate permissions: `cluster:monitor/nodes/info`. diff --git a/_api-reference/nodes-apis/nodes-reload-secure.md b/_api-reference/nodes-apis/nodes-reload-secure.md new file mode 100644 index 00000000..52b2ef67 --- /dev/null +++ b/_api-reference/nodes-apis/nodes-reload-secure.md @@ -0,0 +1,70 @@ +--- +layout: default +title: Nodes reload secure settings +parent: Nodes APIs +nav_order: 50 +--- + +# Nodes reload secure settings +**Introduced 1.0** +{: .label .label-purple } + +The nodes reload secure settings endpoint allows you to change secure settings on a node and reload the secure settings without restarting the node. + +## Path and HTTP methods + +``` +POST _nodes/reload_secure_settings +POST _nodes//reload_secure_settings +``` + +## Path parameter + +You can include the following optional path parameter in your request. + +Parameter | Type | Description +:--- | :--- | :--- +nodeId | String | A comma-separated list of nodeIds used to filter results. Supports [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters). Defaults to `_all`. + +## Request fields + +The request may include an optional object containing the password for the OpenSearch keystore. + +```json +{ + "secure_settings_password": "keystore_password" +} +``` + +#### Example request + +The following is an example API request: + +``` +POST _nodes/reload_secure_settings +``` +{% include copy-curl.html %} + +#### Example response + +The following is an example response: + +```json +{ + "_nodes" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + }, + "cluster_name" : "opensearch-cluster", + "nodes" : { + "t7uqHu4SSuWObK3ElkCRfw" : { + "name" : "opensearch-node1" + } + } +} +``` + +## Required permissions + +If you use the Security plugin, make sure you set the following permissions: `cluster:manage/nodes`. \ No newline at end of file diff --git a/_api-reference/nodes-apis/nodes-stats.md b/_api-reference/nodes-apis/nodes-stats.md new file mode 100644 index 00000000..4fdb5c3c --- /dev/null +++ b/_api-reference/nodes-apis/nodes-stats.md @@ -0,0 +1,1268 @@ +--- +layout: default +title: Nodes stats +parent: Nodes APIs +nav_order: 20 +--- + +# Nodes stats +**Introduced 1.0** +{: .label .label-purple } + +The nodes stats API returns statistics about your cluster. + +## Path and HTTP methods + +```json +GET /_nodes/stats +GET /_nodes//stats +GET /_nodes/stats/ +GET /_nodes//stats/ +GET /_nodes/stats// +GET /_nodes//stats// +``` + +## Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +nodeId | String | A comma-separated list of nodeIds used to filter results. Supports [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters). Defaults to `_all`. +metric | String | A comma-separated list of metric groups that are included in the response. For example, `jvm,fs`. See the following list of all index metrics. Defaults to all metrics. +index_metric | String | A comma-separated list of index metric groups that are included in the response. For example, `docs,store`. See the following list of all index metrics. Defaults to all index metrics. + +The following table lists all available metric groups. + +Metric | Description +:--- |:---- +indices | Index statistics, such as size, document count, and search, index, and delete times for documents. 
+os | Statistics about the host OS, including load, memory, and swapping. +process | Statistics about processes, including their memory consumption, open file descriptors, and CPU usage. +jvm | Statistics about the JVM, including memory pool, buffer pool, and garbage collection, and the number of loaded classes. +fs | File system statistics, such as read/write statistics, data path, and free disk space. +transport | Transport layer statistics about send/receive in cluster communication. +http | Statistics about the HTTP layer. +breaker | Statistics about the field data circuit breakers. +script | Statistics about scripts, such as compilations and cache evictions. +discovery | Statistics about cluster states. +ingest | Statistics about ingest pipelines. +adaptive_selection | Statistics about adaptive replica selection, which selects an eligible node using shard allocation awareness. +script_cache | Statistics about script cache. +indexing_pressure | Statistics about the node's indexing pressure. +shard_indexing_pressure | Statistics about shard indexing pressure. +resource_usage_stats | Node-level resource usage statistics, such as CPU and JVM memory. +admission_control | Statistics about admission control. + +To filter the information returned for the `indices` metric, you can use specific `index_metric` values. You can use these only when you use the following query types: + +```json +GET _nodes/stats/ +GET _nodes/stats/_all +GET _nodes/stats/indices +``` + +The following index metrics are supported: + +- docs +- store +- indexing +- get +- search +- merge +- refresh +- flush +- warmer +- query_cache +- fielddata +- completion +- segments +- translog +- request_cache + +For example, the following query requests statistics for `docs` and `search`: + +```json +GET _nodes/stats/indices/docs,search +``` +{% include copy-curl.html %} + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +completion_fields | String | The fields to include in completion statistics. Supports comma-separated lists and wildcard expressions. +fielddata_fields | String | The fields to include in fielddata statistics. Supports comma-separated lists and wildcard expressions. +fields | String | The fields to include. Supports comma-separated lists and wildcard expressions. +groups | String | A comma-separated list of search groups to include in the search statistics. +level | String | Specifies whether statistics are aggregated at the cluster, index, or shard level. Valid values are `indices`, `node`, and `shard`. +timeout | Time | Sets the time limit for node response. Default is `30s`. +include_segment_file_sizes | Boolean | If segment statistics are requested, this field specifies to return the aggregated disk usage of every Lucene index file. Default is `false`. + +#### Example request + +```json +GET _nodes/stats/ +``` +{% include copy-curl.html %} + +#### Example response + +Select the arrow to view the example response. + +
+ + Response + + {: .text-delta} + +```json +{ + "_nodes" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + }, + "cluster_name" : "docker-cluster", + "nodes" : { + "F-ByTQzVQ3GQeYzQJArJGQ" : { + "timestamp" : 1664484195257, + "name" : "opensearch", + "transport_address" : "127.0.0.1:9300", + "host" : "127.0.0.1", + "ip" : "127.0.0.1:9300", + "roles" : [ + "cluster_manager", + "data", + "ingest", + "remote_cluster_client" + ], + "attributes" : { + "shard_indexing_pressure_enabled" : "true" + }, + "indices" : { + "docs" : { + "count" : 13160, + "deleted" : 12 + }, + "store" : { + "size_in_bytes" : 6263461, + "reserved_in_bytes" : 0 + }, + "indexing" : { + "index_total" : 0, + "index_time_in_millis" : 0, + "index_current" : 0, + "index_failed" : 0, + "delete_total" : 204, + "delete_time_in_millis" : 427, + "delete_current" : 0, + "noop_update_total" : 0, + "is_throttled" : false, + "throttle_time_in_millis" : 0 + }, + "get" : { + "total" : 4, + "time_in_millis" : 18, + "exists_total" : 4, + "exists_time_in_millis" : 18, + "missing_total" : 0, + "missing_time_in_millis" : 0, + "current" : 0 + }, + "search" : { + "open_contexts": 4, + "query_total": 194, + "query_time_in_millis": 467, + "query_current": 0, + "fetch_total": 194, + "fetch_time_in_millis": 143, + "fetch_current": 0, + "scroll_total": 0, + "scroll_time_in_millis": 0, + "scroll_current": 0, + "point_in_time_total": 0, + "point_in_time_time_in_millis": 0, + "point_in_time_current": 0, + "suggest_total": 0, + "suggest_time_in_millis": 0, + "suggest_current": 0, + "request" : { + "dfs_pre_query" : { + "time_in_millis" : 0, + "current" : 0, + "total" : 0 + }, + "query" : { + "time_in_millis" : 200, + "current" : 2, + "total" : 12 + }, + "fetch" : { + "time_in_millis" : 37, + "current" : 3, + "total" : 4 + }, + "dfs_query" : { + "time_in_millis" : 0, + "current" : 0, + "total" : 0 + }, + "expand" : { + "time_in_millis" : 9, + "current" : 1, + "total" : 0 + }, + "can_match" : { + "time_in_millis" : 0, + "current" : 0, + "total" : 0 + } + } + }, + "merges" : { + "current" : 0, + "current_docs" : 0, + "current_size_in_bytes" : 0, + "total" : 1, + "total_time_in_millis" : 5, + "total_docs" : 12, + "total_size_in_bytes" : 3967, + "total_stopped_time_in_millis" : 0, + "total_throttled_time_in_millis" : 0, + "total_auto_throttle_in_bytes" : 251658240 + }, + "refresh" : { + "total" : 74, + "total_time_in_millis" : 201, + "external_total" : 57, + "external_total_time_in_millis" : 314, + "listeners" : 0 + }, + "flush" : { + "total" : 28, + "periodic" : 28, + "total_time_in_millis" : 1261 + }, + "warmer" : { + "current" : 0, + "total" : 45, + "total_time_in_millis" : 99 + }, + "query_cache" : { + "memory_size_in_bytes" : 0, + "total_count" : 0, + "hit_count" : 0, + "miss_count" : 0, + "cache_size" : 0, + "cache_count" : 0, + "evictions" : 0 + }, + "fielddata" : { + "memory_size_in_bytes" : 356, + "evictions" : 0 + }, + "completion" : { + "size_in_bytes" : 0, + "fields" : { } + }, + "segments" : { + "count" : 17, + "memory_in_bytes" : 0, + "terms_memory_in_bytes" : 0, + "stored_fields_memory_in_bytes" : 0, + "term_vectors_memory_in_bytes" : 0, + "norms_memory_in_bytes" : 0, + "points_memory_in_bytes" : 0, + "doc_values_memory_in_bytes" : 0, + "index_writer_memory_in_bytes" : 0, + "version_map_memory_in_bytes" : 0, + "fixed_bit_set_memory_in_bytes" : 288, + "max_unsafe_auto_id_timestamp" : -1, + "remote_store" : { + "upload" : { + "total_upload_size" : { + "started_bytes" : 152419, + "succeeded_bytes" : 152419, + "failed_bytes" : 0 + }, + 
"refresh_size_lag" : { + "total_bytes" : 0, + "max_bytes" : 0 + }, + "max_refresh_time_lag_in_millis" : 0, + "total_time_spent_in_millis" : 516, + "pressure" : { + "total_rejections" : 0 + } + }, + "download" : { + "total_download_size" : { + "started_bytes" : 0, + "succeeded_bytes" : 0, + "failed_bytes" : 0 + }, + "total_time_spent_in_millis" : 0 + } + }, + "file_sizes" : { } + }, + "translog" : { + "operations" : 12, + "size_in_bytes" : 1452, + "uncommitted_operations" : 12, + "uncommitted_size_in_bytes" : 1452, + "earliest_last_modified_age" : 164160, + "remote_store" : { + "upload" : { + "total_uploads" : { + "started" : 57, + "failed" : 0, + "succeeded" : 57 + }, + "total_upload_size" : { + "started_bytes" : 16830, + "failed_bytes" : 0, + "succeeded_bytes" : 16830 + } + } + } + }, + "request_cache" : { + "memory_size_in_bytes" : 1649, + "evictions" : 0, + "hit_count" : 0, + "miss_count" : 18 + }, + "recovery" : { + "current_as_source" : 0, + "current_as_target" : 0, + "throttle_time_in_millis" : 0 + } + }, + "os" : { + "timestamp" : 1664484195263, + "cpu" : { + "percent" : 0, + "load_average" : { + "1m" : 0.0, + "5m" : 0.0, + "15m" : 0.0 + } + }, + "mem" : { + "total_in_bytes" : 13137076224, + "free_in_bytes" : 9265442816, + "used_in_bytes" : 3871633408, + "free_percent" : 71, + "used_percent" : 29 + }, + "swap" : { + "total_in_bytes" : 4294967296, + "free_in_bytes" : 4294967296, + "used_in_bytes" : 0 + }, + "cgroup" : { + "cpuacct" : { + "control_group" : "/", + "usage_nanos" : 338710071600 + }, + "cpu" : { + "control_group" : "/", + "cfs_period_micros" : 100000, + "cfs_quota_micros" : -1, + "stat" : { + "number_of_elapsed_periods" : 0, + "number_of_times_throttled" : 0, + "time_throttled_nanos" : 0 + } + }, + "memory" : { + "control_group" : "/", + "limit_in_bytes" : "9223372036854771712", + "usage_in_bytes" : "1432346624" + } + } + }, + "process" : { + "timestamp" : 1664484195263, + "open_file_descriptors" : 556, + "max_file_descriptors" : 65536, + "cpu" : { + "percent" : 0, + "total_in_millis" : 170870 + }, + "mem" : { + "total_virtual_in_bytes" : 6563344384 + } + }, + "jvm" : { + "timestamp" : 1664484195264, + "uptime_in_millis" : 21232111, + "mem" : { + "heap_used_in_bytes" : 308650480, + "heap_used_percent" : 57, + "heap_committed_in_bytes" : 536870912, + "heap_max_in_bytes" : 536870912, + "non_heap_used_in_bytes" : 147657128, + "non_heap_committed_in_bytes" : 152502272, + "pools" : { + "young" : { + "used_in_bytes" : 223346688, + "max_in_bytes" : 0, + "peak_used_in_bytes" : 318767104, + "peak_max_in_bytes" : 0, + "last_gc_stats" : { + "used_in_bytes" : 0, + "max_in_bytes" : 0, + "usage_percent" : -1 + } + }, + "old" : { + "used_in_bytes" : 67068928, + "max_in_bytes" : 536870912, + "peak_used_in_bytes" : 67068928, + "peak_max_in_bytes" : 536870912, + "last_gc_stats" : { + "used_in_bytes" : 34655744, + "max_in_bytes" : 536870912, + "usage_percent" : 6 + } + }, + "survivor" : { + "used_in_bytes" : 18234864, + "max_in_bytes" : 0, + "peak_used_in_bytes" : 32721280, + "peak_max_in_bytes" : 0, + "last_gc_stats" : { + "used_in_bytes" : 18234864, + "max_in_bytes" : 0, + "usage_percent" : -1 + } + } + } + }, + "threads" : { + "count" : 80, + "peak_count" : 80 + }, + "gc" : { + "collectors" : { + "young" : { + "collection_count" : 18, + "collection_time_in_millis" : 199 + }, + "old" : { + "collection_count" : 0, + "collection_time_in_millis" : 0 + } + } + }, + "buffer_pools" : { + "mapped" : { + "count" : 23, + "used_in_bytes" : 6232113, + "total_capacity_in_bytes" : 6232113 + }, + 
"direct" : { + "count" : 63, + "used_in_bytes" : 9050069, + "total_capacity_in_bytes" : 9050068 + }, + "mapped - 'non-volatile memory'" : { + "count" : 0, + "used_in_bytes" : 0, + "total_capacity_in_bytes" : 0 + } + }, + "classes" : { + "current_loaded_count" : 20693, + "total_loaded_count" : 20693, + "total_unloaded_count" : 0 + } + }, + "thread_pool" : { + "OPENSEARCH_ML_TASK_THREAD_POOL" : { + "threads" : 0, + "queue" : 0, + "active" : 0, + "rejected" : 0, + "largest" : 0, + "completed" : 0 + }, + "ad-batch-task-threadpool" : { + "threads" : 0, + "queue" : 0, + "active" : 0, + "rejected" : 0, + "largest" : 0, + "completed" : 0 + }, + ... + }, + "fs" : { + "timestamp" : 1664484195264, + "total" : { + "total_in_bytes" : 269490393088, + "free_in_bytes" : 261251477504, + "available_in_bytes" : 247490805760 + }, + "data" : [ + { + "path" : "/usr/share/opensearch/data/nodes/0", + "mount" : "/ (overlay)", + "type" : "overlay", + "total_in_bytes" : 269490393088, + "free_in_bytes" : 261251477504, + "available_in_bytes" : 247490805760 + } + ], + "io_stats" : { } + }, + "transport" : { + "server_open" : 0, + "total_outbound_connections" : 0, + "rx_count" : 0, + "rx_size_in_bytes" : 0, + "tx_count" : 0, + "tx_size_in_bytes" : 0 + }, + "http" : { + "current_open" : 5, + "total_opened" : 1108 + }, + "breakers" : { + "request" : { + "limit_size_in_bytes" : 322122547, + "limit_size" : "307.1mb", + "estimated_size_in_bytes" : 0, + "estimated_size" : "0b", + "overhead" : 1.0, + "tripped" : 0 + }, + "fielddata" : { + "limit_size_in_bytes" : 214748364, + "limit_size" : "204.7mb", + "estimated_size_in_bytes" : 356, + "estimated_size" : "356b", + "overhead" : 1.03, + "tripped" : 0 + }, + "in_flight_requests" : { + "limit_size_in_bytes" : 536870912, + "limit_size" : "512mb", + "estimated_size_in_bytes" : 0, + "estimated_size" : "0b", + "overhead" : 2.0, + "tripped" : 0 + }, + "parent" : { + "limit_size_in_bytes" : 510027366, + "limit_size" : "486.3mb", + "estimated_size_in_bytes" : 308650480, + "estimated_size" : "294.3mb", + "overhead" : 1.0, + "tripped" : 0 + } + }, + "script" : { + "compilations" : 0, + "cache_evictions" : 0, + "compilation_limit_triggered" : 0 + }, + "discovery" : { + "cluster_state_queue" : { + "total" : 0, + "pending" : 0, + "committed" : 0 + }, + "published_cluster_states" : { + "full_states" : 2, + "incompatible_diffs" : 0, + "compatible_diffs" : 10 + }, + "cluster_state_stats" : { + "overall" : { + "update_count" : 9, + "total_time_in_millis" : 807, + "failed_count" : 0 + }, + "remote_upload" : { + "success_count" : 9, + "failed_count" : 0, + "total_time_in_millis" : 116, + "cleanup_attempt_failed_count" : 0 + } + } + }, + "ingest" : { + "total" : { + "count" : 0, + "time_in_millis" : 0, + "current" : 0, + "failed" : 0 + }, + "pipelines" : { } + }, + "search_pipeline" : { + "total_request" : { + "count" : 5, + "time_in_millis" : 158, + "current" : 0, + "failed" : 0 + }, + "total_response" : { + "count" : 2, + "time_in_millis" : 1, + "current" : 0, + "failed" : 0 + }, + "pipelines" : { + "public_info" : { + "request" : { + "count" : 3, + "time_in_millis" : 71, + "current" : 0, + "failed" : 0 + }, + "response" : { + "count" : 0, + "time_in_millis" : 0, + "current" : 0, + "failed" : 0 + }, + "request_processors" : [ + { + "filter_query:abc" : { + "type" : "filter_query", + "stats" : { + "count" : 1, + "time_in_millis" : 0, + "current" : 0, + "failed" : 0 + } + } + }, + ] + ... 
+ "response_processors" : [ + { + "rename_field" : { + "type" : "rename_field", + "stats" : { + "count" : 2, + "time_in_millis" : 1, + "current" : 0, + "failed" : 0 + } + } + } + ] + }, + ... + } + }, + "adaptive_selection" : { + "F-ByTQzVQ3GQeYzQJArJGQ" : { + "outgoing_searches" : 0, + "avg_queue_size" : 0, + "avg_service_time_ns" : 501024, + "avg_response_time_ns" : 794105, + "rank" : "0.8" + } + }, + "script_cache" : { + "sum" : { + "compilations" : 0, + "cache_evictions" : 0, + "compilation_limit_triggered" : 0 + }, + "contexts" : [ + { + "context" : "aggregation_selector", + "compilations" : 0, + "cache_evictions" : 0, + "compilation_limit_triggered" : 0 + }, + { + "context" : "aggs", + "compilations" : 0, + "cache_evictions" : 0, + "compilation_limit_triggered" : 0 + }, + ... + ] + }, + "indexing_pressure" : { + "memory" : { + "current" : { + "combined_coordinating_and_primary_in_bytes" : 0, + "coordinating_in_bytes" : 0, + "primary_in_bytes" : 0, + "replica_in_bytes" : 0, + "all_in_bytes" : 0 + }, + "total" : { + "combined_coordinating_and_primary_in_bytes" : 40256, + "coordinating_in_bytes" : 40256, + "primary_in_bytes" : 45016, + "replica_in_bytes" : 0, + "all_in_bytes" : 40256, + "coordinating_rejections" : 0, + "primary_rejections" : 0, + "replica_rejections" : 0 + }, + "limit_in_bytes" : 53687091 + } + }, + "shard_indexing_pressure" : { + "stats" : { }, + "total_rejections_breakup_shadow_mode" : { + "node_limits" : 0, + "no_successful_request_limits" : 0, + "throughput_degradation_limits" : 0 + }, + "enabled" : false, + "enforced" : false + }, + "resource_usage_stats": { + "nxLWtMdXQmWA-ZBVWU8nwA": { + "timestamp": 1698401391000, + "cpu_utilization_percent": "0.1", + "memory_utilization_percent": "3.9" + } + }, + "admission_control": { + "global_cpu_usage": { + "transport": { + "rejection_count": { + "search": 3, + "indexing": 1 + } + } + } + } + } + } +} +``` +
+ +## Response fields + +The following table lists all response fields. + +| Field | Data type | Description | +| :--- | :--- | :--- | +| _nodes | Object | Statistics about the nodes that are returned. | +| _nodes.total | Integer | The total number of nodes for this request. | +| _nodes.successful | Integer | The number of nodes for which the request was successful. | +| _nodes.failed | Integer | The number of nodes for which the request failed. If there are nodes for which the request failed, the failure message is included. | +| cluster_name | String | The name of the cluster. | +| [nodes](#nodes) | Object | Statistics for the nodes included in this request. | + +### `nodes` + +The `nodes` object contains all nodes that are returned by the request, along with their IDs. Each node has the following properties. + +Field | Data type | Description +:--- | :--- | :--- +timestamp | Integer | The time the nodes statistics were collected, in milliseconds since the epoch. +name | String | The name of the node. +transport_address | IP address | The host and port of the transport layer that is used by nodes in a cluster to communicate internally. +host | IP address | The network host of the node. +ip | IP address | The IP address and port of the node. +roles | Array | The roles of the node (for example, `cluster_manager`, `data`, or `ingest`). +attributes | Object | The attributes of the node (for example, `shard_indexing_pressure_enabled`). +[indices](#indices) | Object | Index statistics for each index that has shards on the node. +[os](#os) | Object | Statistics about the OS for the node. +[process](#process) | Object | Process statistics for the node. +[jvm](#jvm) | Object | Statistics about the JVM for the node. +[thread_pool](#thread_pool)| Object | Statistics about each thread pool for the node. +[fs](#fs) | Object | Statistics about the file stores for the node. +[transport](#transport) | Object | Transport statistics for the node. +http | Object | HTTP statistics for the node. +http.current_open | Integer | The number of currently open HTTP connections for the node. +http.total_opened | Integer | The total number of HTTP connections the node has opened since it started. +[breakers](#breakers) | Object | Statistics about the circuit breakers for the node. +[script](#script-and-script_cache)| Object | Script statistics for the node. +[script_cache](#script-and-script_cache)| Object | Script cache statistics for the node. +[discovery](#discovery) | Object | Node discovery statistics for the node. +[ingest](#ingest) | Object | Ingest statistics for the node. +[search_pipeline](#search_pipeline) | Object | Statistics related to [search pipelines]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/). +[adaptive_selection](#adaptive_selection) | Object | Statistics about adaptive selections for the node. +[indexing_pressure](#indexing_pressure) | Object | Statistics related to the node's indexing pressure. +[shard_indexing_pressure](#shard_indexing_pressure) | Object | Statistics related to indexing pressure at the shard level. +[search_backpressure]({{site.url}}{{site.baseurl}}/opensearch/search-backpressure#search-backpressure-stats-api) | Object | Statistics related to search backpressure. +[resource_usage_stats](#resource_usage_stats) | Object | Statistics related to resource usage for the node. +[admission_control](#admission_control) | Object | Statistics related to admission control for the node. 
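+
+Several of these objects can be narrowed further using the query parameters described earlier. For example, the following illustrative request returns only the `segments` statistics from the `indices` object and includes per-file disk usage by setting the documented `include_segment_file_sizes` parameter:
+
+```json
+GET /_nodes/stats/indices/segments?include_segment_file_sizes=true
+```
+{% include copy-curl.html %}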
+ +### `indices` + +The `indices` object contains the index statistics for each index with shards on this node. Each index has the following properties. + +Field | Field type | Description +:--- | :--- | :--- +docs | Object | Document statistics for all primary shards that exist on the node. +docs.count | Integer | The number of documents reported by Lucene. Excludes deleted documents and recently indexed documents that are not yet assigned to a segment. Nested documents are counted separately. +docs.deleted | Integer | The number of deleted documents reported by Lucene. Excludes recent deletion operations that have not yet affect the segment. +store | Object | Statistics about the shard sizes of the shards on the node. +store.size_in_bytes | Integer | Total size of all shards on the node. +store.reserved_in_bytes | Integer | The predicted number of bytes the shard store will grow to be because of activities such as restoring snapshots and peer recoveries. +indexing | Object | Statistics about indexing operations for the node. +indexing.index_total | Integer | The total number of indexing operations on the node. +indexing.index_time_in_millis | Integer | The total time for all indexing operations, in milliseconds. +indexing.index_current | Integer | The number of indexing operations that are currently running. +indexing.index_failed | Integer | The number of indexing operations that have failed. +indexing.delete_total | Integer | The total number of deletions. +indexing.delete_time_in_millis | Integer | The total time for all deletion operations, in milliseconds. +indexing.delete_current | Integer | The number of deletion operations that are currently running. +indexing.noop_update_total | Integer | The total number of noop operations. +indexing.is_throttled | Boolean | Specifies whether any operations were throttled. +indexing.throttle_time_in_millis | Integer | The total time for throttling operations, in milliseconds. +get | Object | Statistics about the get operations for the node. +get.total | Integer | The total number of get operations. +get.time_in_millis | Integer | The total time for all get operations, in milliseconds. +get.exists_total | Integer | The total number of successful get operations. +get.exists_time_in_millis | Integer | The total time for all successful get operations, in milliseconds. +get.missing_total | Integer | The number of failed get operations. +get.missing_time_in_millis | Integer | The total time for all failed get operations, in milliseconds. +get.current | Integer | The number of get operations that are currently running. +search | Object | Statistics about the search operations for the node. +search.concurrent_avg_slice_count | Integer | The average slice count of all search requests. This is computed as the total slice count divided by the total number of concurrent search requests. +search.concurrent_query_total |Integer | The total number of query operations that use concurrent segment search. +search.concurrent_query_time_in_millis | Integer | The total amount of time taken by all query operations that use concurrent segment search, in milliseconds. +search.concurrent_query_current |Integer | The number of currently running query operations that use concurrent segment search. +search.open_contexts | Integer | The number of open search contexts. +search.query_total | Integer | The total number of shard query operations. +search.query_time_in_millis | Integer | The total amount of time for all shard query operations, in milliseconds. 
+search.query_current | Integer | The number of shard query operations that are currently running. +search.fetch_total | Integer | The total number of shard fetch operations. +search.fetch_time_in_millis | Integer | The total amount of time for all shard fetch operations, in milliseconds. +search.fetch_current | Integer | The number of shard fetch operations that are currently running. +search.scroll_total | Integer | The total number of shard scroll operations. +search.scroll_time_in_millis | Integer | The total amount of time for all shard scroll operations, in milliseconds. +search.scroll_current | Integer | The number of shard scroll operations that are currently running. +search.point_in_time_total | Integer | The total number of shard Point in Time (PIT) contexts that have been created (completed and active) since the node last restarted. +search.point_in_time_time_in_millis | Integer | The amount of time that shard PIT contexts have been held open since the node last restarted, in milliseconds. +search.point_in_time_current | Integer | The number of shard PIT contexts currently open. +search.suggest_total | Integer | The total number of shard suggest operations. +search.suggest_time_in_millis | Integer | The total amount of time for all shard suggest operations, in milliseconds. +search.suggest_current | Integer | The number of shard suggest operations that are currently running. +search.request | Object | Statistics about coordinator search operations for the node. +search.request.dfs_pre_query.time_in_millis | Integer | The total amount of time for all coordinator depth-first search (DFS) prequery operations, in milliseconds. +search.request.dfs_pre_query.current | Integer | The number of coordinator DFS prequery operations that are currently running. +search.request.dfs_pre_query.total | Integer | The total number of coordinator DFS prequery operations. +search.request.query.time_in_millis | Integer | The total amount of time for all coordinator query operations, in milliseconds. +search.request.query.current | Integer | The number of coordinator query operations that are currently running. +search.request.query.total | Integer | The total number of coordinator query operations. +search.request.fetch.time_in_millis | Integer | The total amount of time for all coordinator fetch operations, in milliseconds. +search.request.fetch.current | Integer | The number of coordinator fetch operations that are currently running. +search.request.fetch.total | Integer | The total number of coordinator fetch operations. +search.request.dfs_query.time_in_millis | Integer | The total amount of time for all coordinator DFS prequery operations, in milliseconds. +search.request.dfs_query.current | Integer | The number of coordinator DFS prequery operations that are currently running. +search.request.dfs_query.total | Integer | The total number of coordinator DFS prequery operations. +search.request.expand.time_in_millis | Integer | The total amount of time for all coordinator expand operations, in milliseconds. +search.request.expand.current | Integer | The number of coordinator expand operations that are currently running. +search.request.expand.total | Integer | The total number of coordinator expand operations. +search.request.can_match.time_in_millis | Integer | The total amount of time for all coordinator match operations, in milliseconds. +search.request.can_match.current | Integer | The number of coordinator match operations that are currently running. 
+search.request.can_match.total | Integer | The total number of coordinator match operations. +merges | Object | Statistics about merge operations for the node. +merges.current | Integer | The number of merge operations that are currently running. +merges.current_docs | Integer | The number of document merges that are currently running. +merges.current_size_in_bytes | Integer | The memory size, in bytes, that is used to perform current merge operations. +merges.total | Integer | The total number of merge operations. +merges.total_time_in_millis | Integer | The total time for merges, in milliseconds. +merges.total_docs | Integer | The total number of documents that have been merged. +merges.total_size_in_bytes | Integer | The total size of all merged documents, in bytes. +merges.total_stopped_time_in_millis | Integer | The total time spent on stopping merge operations, in milliseconds. +merges.total_throttled_time_in_millis | Integer | The total time spent on throttling merge operations, in milliseconds. +merges.total_auto_throttle_in_bytes | Integer | The total size of automatically throttled merge operations, in bytes. +refresh | Object | Statistics about refresh operations for the node. +refresh.total | Integer | The total number of refresh operations. +refresh.total_time_in_millis | Integer | The total time for all refresh operations, in milliseconds. +refresh.external_total | Integer | The total number of external refresh operations. +refresh.external_total_time_in_millis | Integer | The total time for all external refresh operations, in milliseconds. +refresh.listeners | Integer | The number of refresh listeners. +flush | Object | Statistics about flush operations for the node. +flush.total | Integer | The total number of flush operations. +flush.periodic | Integer | The total number of periodic flush operations. +flush.total_time_in_millis | Integer | The total time for all flush operations, in milliseconds. +warmer | Object | Statistics about the index warming operations for the node. +warmer.current | Integer | The number of current index warming operations. +warmer.total | Integer | The total number of index warming operations. +warmer.total_time_in_millis | Integer | The total time for all index warming operations, in milliseconds. +query_cache | Statistics about query cache operations for the node. +query_cache.memory_size_in_bytes | Integer | The amount of memory used for the query cache for all shards in the node. +query_cache.total_count | Integer | The total number of hits, misses, and cached queries in the query cache. +query_cache.hit_count | Integer | The total number of hits in the query cache. +query_cache.miss_count | Integer | The total number of misses in the query cache. +query_cache.cache_size | Integer | The size of the query cache, in bytes. +query_cache.cache_count | Integer | The number of queries in the query cache. +query_cache.evictions | Integer | The number of evictions in the query cache. +fielddata | Object | Statistics about the field data cache for all shards in the node. +fielddata.memory_size_in_bytes | Integer | The total amount of memory used for the field data cache for all shards in the node. +fielddata.evictions | Integer | The number of evictions in the field data cache. +fielddata.fields | Object | Contains all field data fields. +completion | Object | Statistics about completions for all shards in the node. +completion.size_in_bytes | Integer | The total amount of memory used for completion for all shards in the node, in bytes. 
+completion.fields | Object | Contains completion fields. +segments | Object | Statistics about segments for all shards in the node. +segments.count | Integer | The total number of segments. +segments.memory_in_bytes | Integer | The total amount of memory, in bytes. +segments.terms_memory_in_bytes | Integer | The total amount of memory used for terms, in bytes. +segments.stored_fields_memory_in_bytes | Integer | The total amount of memory used for stored fields, in bytes. +segments.term_vectors_memory_in_bytes | Integer | The total amount of memory used for term vectors, in bytes. +segments.norms_memory_in_bytes | Integer | The total amount of memory used for normalization factors, in bytes. +segments.points_memory_in_bytes | Integer | The total amount of memory used for points, in bytes. +segments.doc_values_memory_in_bytes | Integer | The total amount of memory used for doc values, in bytes. +segments.index_writer_memory_in_bytes | Integer | The total amount of memory used by all index writers, in bytes. +segments.version_map_memory_in_bytes | Integer | The total amount of memory used by all version maps, in bytes. +segments.fixed_bit_set_memory_in_bytes | Integer | The total amount of memory used by fixed bit sets, in bytes. Fixed bit sets are used for nested objects and join fields. +segments.max_unsafe_auto_id_timestamp | Integer | The timestamp for the most recently retired indexing request, in milliseconds since the epoch. +segments.segment_replication | Object | Segment replication statistics for all primary shards when segment replication is enabled on the node. +segments.segment_replication.max_bytes_behind | long | The maximum number of bytes behind the primary replica. +segments.segment_replication.total_bytes_behind | long | The total number of bytes behind the primary replicas. +segments.segment_replication.max_replication_lag | long | The maximum amount of time, in milliseconds, taken by a replica to catch up to its primary. +segments.remote_store | Object | Statistics about remote segment store operations. +segments.remote_store.upload | Object | Statistics related to uploads to the remote segment store. +segments.remote_store.upload.total_upload_size | Object | The amount of data, in bytes, uploaded to the remote segment store. +segments.remote_store.upload.total_upload_size.started_bytes | Integer | The number of bytes to upload to the remote segment store after the upload has started. +segments.remote_store.upload.total_upload_size.succeeded_bytes | Integer | The number of bytes successfully uploaded to the remote segment store. +segments.remote_store.upload.total_upload_size.failed_bytes | Integer | The number of bytes that failed to upload to the remote segment store. +segments.remote_store.upload.refresh_size_lag | Object | The amount of lag during upload between the remote segment store and the local store. +segments.remote_store.upload.refresh_size_lag.total_bytes | Integer | The total number of bytes that lagged during the upload refresh between the remote segment store and the local store. +segments.remote_store.upload.refresh_size_lag.max_bytes | Integer | The maximum amount of lag, in bytes, during the upload refresh between the remote segment store and the local store. +segments.remote_store.upload.max_refresh_time_lag_in_millis | Integer | The maximum duration, in milliseconds, that the remote refresh is behind the local refresh. 
+segments.remote_store.upload.total_time_spent_in_millis | Integer | The total amount of time, in milliseconds, spent on uploads to the remote segment store. +segments.remote_store.upload.pressure | Object | Statistics related to segment store upload backpressure. +segments.remote_store.upload.pressure.total_rejections | Integer | The total number of requests rejected due to segment store upload backpressure. +segments.remote_store.download | Object | Statistics related to downloads to the remote segment store. +segments.remote_store.download.total_download_size | Object | The total amount of data download from the remote segment store. +segments.remote_store.download.total_download_size.started_bytes | Integer | The number of bytes downloaded from the remote segment store after the download starts. +segments.remote_store.download.total_download_size.succeeded_bytes | Integer | The number of bytes successfully downloaded from the remote segment store. +segments.remote_store.download.total_download_size.failed_bytes | Integer | The number of bytes that failed to download from the remote segment store. +segments.remote_store.download.total_time_spent_in_millis | Integer | The total duration, in milliseconds, spent on downloads from the remote segment store. +segments.file_sizes | Integer | Statistics about the size of the segment files. +translog | Object | Statistics about transaction log operations for the node. +translog.operations | Integer | The number of translog operations. +translog.size_in_bytes | Integer | The size of the translog, in bytes. +translog.uncommitted_operations | Integer | The number of uncommitted translog operations. +translog.uncommitted_size_in_bytes | Integer | The size of uncommitted translog operations, in bytes. +translog.earliest_last_modified_age | Integer | The earliest last modified age for the translog. +translog.remote_store | Object | Statistics related to operations from the remote translog store. +translog.remote_store.upload | Object | Statistics related to uploads to the remote translog store. +translog.remote_store.upload.total_uploads | Object | The number of syncs to the remote translog store. +translog.remote_store.upload.total_uploads.started | Integer | The number of upload syncs to the remote translog store that have started. +translog.remote_store.upload.total_uploads.failed | Integer | The number of failed upload syncs to the remote translog store. +translog.remote_store.upload.total_uploads.succeeded | Integer | The number of successful upload syncs to the remote translog store. +translog.remote_store.upload.total_upload_size | Object | The total amount of data uploaded to the remote translog store. +translog.remote_store.upload.total_upload_size.started_bytes | Integer | The number of bytes actively uploading to the remote translog store after the upload has started. +translog.remote_store.upload.total_upload_size.failed_bytes | Integer | The number of bytes that failed to upload to the remote translog store. +translog.remote_store.upload.total_upload_size.succeeded_bytes | Integer | The number of bytes successfully uploaded to the remote translog store. +request_cache | Object | Statistics about the request cache for the node. +request_cache.memory_size_in_bytes | Integer | The memory size used by the request cache, in bytes. +request_cache.evictions | Integer | The number of request cache evictions. +request_cache.hit_count | Integer | The number of request cache hits. 
+request_cache.miss_count | Integer | The number of request cache misses.
+recovery | Object | Statistics about recovery operations for the node.
+recovery.current_as_source | Integer | The number of recovery operations that have used an index shard as a source.
+recovery.current_as_target | Integer | The number of recovery operations that have used an index shard as a target.
+recovery.throttle_time_in_millis | Integer | The delay of recovery operations due to throttling, in milliseconds.
+
+### `os`
+
+The `os` object contains OS statistics for the node and has the following properties.
+
+Field | Field type | Description
+:--- | :--- | :---
+timestamp | Integer | The last refresh time for the OS statistics, in milliseconds since the epoch.
+cpu | Object | Statistics about the node's CPU usage.
+cpu.percent | Integer | Recent CPU usage for the system.
+cpu.load_average | Object | Statistics about load averages for the system.
+cpu.load_average.1m | Float | The load average for the system for the time period of one minute.
+cpu.load_average.5m | Float | The load average for the system for the time period of five minutes.
+cpu.load_average.15m | Float | The load average for the system for the time period of 15 minutes.
+mem | Object | Statistics about memory usage for the node.
+mem.total_in_bytes | Integer | The total amount of physical memory, in bytes.
+mem.free_in_bytes | Integer | The total amount of free physical memory, in bytes.
+mem.used_in_bytes | Integer | The total amount of used physical memory, in bytes.
+mem.free_percent | Integer | The percentage of memory that is free.
+mem.used_percent | Integer | The percentage of memory that is used.
+swap | Object | Statistics about swap space for the node.
+swap.total_in_bytes | Integer | The total amount of swap space, in bytes.
+swap.free_in_bytes | Integer | The total amount of free swap space, in bytes.
+swap.used_in_bytes | Integer | The total amount of used swap space, in bytes.
+cgroup | Object | Contains cgroup statistics for the node. Returned for Linux only.
+cgroup.cpuacct | Object | Statistics about the cpuacct control group for the node.
+cgroup.cpu | Object | Statistics about the CPU control group for the node.
+cgroup.memory | Object | Statistics about the memory control group for the node.
+
+### `process`
+
+The `process` object contains process statistics for the node and has the following properties.
+
+Field | Field type | Description
+:--- | :--- | :---
+timestamp | Integer | The last refresh time for the process statistics, in milliseconds since the epoch.
+open_file_descriptors | Integer | The number of open file descriptors associated with the current process.
+max_file_descriptors | Integer | The maximum number of file descriptors for the system.
+cpu | Object | Statistics about the CPU for the node.
+cpu.percent | Integer | The percentage of CPU usage for the process.
+cpu.total_in_millis | Integer | The total CPU time used by the process on which the JVM is running, in milliseconds.
+mem | Object | Statistics about the memory for the node.
+mem.total_virtual_in_bytes | Integer | The total amount of virtual memory that is guaranteed to be available to the process that is currently running, in bytes.
+
+### `jvm`
+
+The `jvm` object contains statistics about the JVM for the node and has the following properties.
+
+Field | Field type | Description
+:--- | :--- | :---
+timestamp | Integer | The last refresh time for the JVM statistics, in milliseconds since the epoch.
+uptime_in_millis | Integer | The JVM uptime, in milliseconds.
+mem | Object | Statistics for the JVM memory usage on the node.
+mem.heap_used_in_bytes | Integer | The amount of heap memory that is currently being used, in bytes.
+mem.heap_used_percent | Integer | The percentage of memory that is currently used by the heap.
+mem.heap_committed_in_bytes | Integer | The amount of memory available for use by the heap, in bytes.
+mem.heap_max_in_bytes | Integer | The maximum amount of memory available for use by the heap, in bytes.
+mem.non_heap_used_in_bytes | Integer | The amount of non-heap memory that is currently used, in bytes.
+mem.non_heap_committed_in_bytes | Integer | The maximum amount of non-heap memory available for use, in bytes.
+mem.pools | Object | Statistics about heap memory usage for the node.
+mem.pools.young | Object | Statistics about the young generation heap memory usage for the node. Contains the amount of memory used, the maximum amount of memory available, and the peak amount of memory used.
+mem.pools.old | Object | Statistics about the old generation heap memory usage for the node. Contains the amount of memory used, the maximum amount of memory available, and the peak amount of memory used.
+mem.pools.survivor | Object | Statistics about the survivor space memory usage for the node. Contains the amount of memory used, the maximum amount of memory available, and the peak amount of memory used.
+threads | Object | Statistics about the JVM thread usage for the node.
+threads.count | Integer | The number of threads that are currently active in the JVM.
+threads.peak_count | Integer | The peak number of threads used by the JVM.
+gc.collectors | Object | Statistics about the JVM garbage collectors for the node.
+gc.collectors.young | Object | Statistics about JVM garbage collectors that collect young generation objects.
+gc.collectors.young.collection_count | Integer | The number of garbage collections performed on young generation objects.
+gc.collectors.young.collection_time_in_millis | Integer | The total time spent on garbage collection of young generation objects, in milliseconds.
+gc.collectors.old | Object | Statistics about JVM garbage collectors that collect old generation objects.
+gc.collectors.old.collection_count | Integer | The number of garbage collections performed on old generation objects.
+gc.collectors.old.collection_time_in_millis | Integer | The total time spent on garbage collection of old generation objects, in milliseconds.
+buffer_pools | Object | Statistics about the JVM buffer pools for the node.
+buffer_pools.mapped | Object | Statistics about the mapped JVM buffer pools for the node.
+buffer_pools.mapped.count | Integer | The number of mapped buffer pools.
+buffer_pools.mapped.used_in_bytes | Integer | The amount of memory used by mapped buffer pools, in bytes.
+buffer_pools.mapped.total_capacity_in_bytes | Integer | The total capacity of the mapped buffer pools, in bytes.
+buffer_pools.direct | Object | Statistics about the direct JVM buffer pools for the node.
+buffer_pools.direct.count | Integer | The number of direct buffer pools.
+buffer_pools.direct.used_in_bytes | Integer | The amount of memory used by direct buffer pools, in bytes.
+buffer_pools.direct.total_capacity_in_bytes | Integer | The total capacity of the direct buffer pools, in bytes.
+classes | Object | Statistics about the classes loaded by the JVM for the node. +classes.current_loaded_count | Integer | The number of classes currently loaded by the JVM. +classes.total_loaded_count | Integer | The total number of classes loaded by the JVM since it started. +classes.total_unloaded_count | Integer | The total number of classes unloaded by the JVM since it started. + +### `thread_pool` + +The `thread_pool` object contains a list of all thread pools. Each thread pool is a nested object that is specified by its ID and contains the following properties. + +Field | Field type | Description +:--- | :--- | :--- +threads | Integer | The number of threads in the pool. +queue | Integer | The number of threads in queue. +active | Integer | The number of active threads in the pool. +rejected | Integer | The number of tasks that have been rejected. +largest | Integer | The peak number of threads in the pool. +completed | Integer | The number of tasks completed. +total_wait_time | Integer | The total amount of time tasks spent waiting in the thread pool queue. Currently, only `search`, `search_throttled`, and `index_searcher` thread pools support this metric. + +### `fs` + +The `fs` object represents statistics about the file stores for the node. It has the following properties. + +Field | Field type | Description +:--- | :--- | :--- +timestamp | Integer | The last refresh time for the file store statistics, in milliseconds since the epoch. +total | Object | Statistics for all file stores of the node. +total.total_in_bytes | Integer | The total memory size of all file stores, in bytes. +total.free_in_bytes | Integer | The total unallocated disk space in all file stores, in bytes. +total.available_in_bytes | Integer | The total disk space available to the JVM on all file stores. Represents the actual amount of memory, in bytes, that OpenSearch can use. +data | Array | The list of all file stores. Each file store has the following properties. +data.path | String | The path to the file store. +data.mount | String | The mount point of the file store. +data.type | String | The type of the file store (for example, overlay). +data.total_in_bytes | Integer | The total size of the file store, in bytes. +data.free_in_bytes | Integer | The total unallocated disk space in the file store, in bytes. +data.available_in_bytes | Integer | The total amount of disk space available to the JVM for the file store, in bytes. +io_stats | Object | I/O statistics for the node (Linux only). Includes devices, read and write operations, and the I/O operation time. + +### `transport` + +The `transport` object has the following properties. + +Field | Field type | Description +:--- | :--- | :--- +server_open | Integer | The number of open inbound TCP connections that OpenSearch nodes use for internal communication. +total_outbound_connections | Integer | The total number of outbound transport connections that the node has opened since it started. +rx_count | Integer | The total number of RX (receive) packets the node received during internal communication. +rx_size_in_bytes | Integer | The total size of RX packets the node received during internal communication, in bytes. +tx_count | Integer | The total number of TX (transmit) packets the node sent during internal communication. +tx_size_in_bytes | Integer | The total size of TX (transmit) packets the node sent during internal communication, in bytes. + +### `breakers` + +The `breakers` object contains statistics about the circuit breakers for the node. 
Each circuit breaker is a nested object listed by name and contains the following properties. + +Field | Field type | Description +:--- | :--- | :--- +limit_size_in_bytes | Integer | The memory limit for the circuit breaker, in bytes. +limit_size | Byte value | The memory limit for the circuit breaker in human-readable format (for example, `307.1mb`). +estimated_size_in_bytes | Integer | The estimated memory used for the operation, in bytes. +estimated_size | Byte value | The estimated memory used for the operation in human-readable format (for example, `356b`). +overhead | Float | A factor that all estimates are multiplied by to calculate the final estimate. +tripped | Integer | The total number of times the circuit breaker has been activated to prevent an out-of-memory error. + +### `script` and `script_cache` + +The `script` and `script_cache` objects have the following properties. + +Field | Field type | Description +:--- | :--- | :--- +script | Object | Script statistics for the node. +script.compilations | Integer | The total number of script compilations for the node. +script.cache_evictions| Integer | The total number of times the script cache has purged old data. +script.compilation_limit_triggered | Integer | The total number of times script compilation was limited by a circuit breaker. +script_cache | Object | Script cache statistics for the node. +script_cache.sum.compilations | Integer | The total number of script compilations in the cache for the node. +script_cache.sum.cache_evictions| Integer | The total number of times the script cache has purged old data. +script_cache.sum.compilation_limit_triggered | Integer | The total number of times script compilation in the cache was limited by a circuit breaker. +script_cache.contexts | Array of objects | The list of contexts for the script cache. Each context contains its name, the number of compilations, the number of cache evictions, and the number of times the script was limited by a circuit breaker. + +### `discovery` + +The `discovery` object contains the node discovery statistics and has the following properties. + +Field | Field type | Description +:--- | :--- | :--- +cluster_state_queue | Object | Cluster state queue statistics for the node. +cluster_state_queue.total | Integer | The total number of cluster states in the queue. +cluster_state_queue.pending | Integer | The number of pending cluster states in the queue. +cluster_state_queue.committed | Integer | The number of committed cluster states in the queue. +published_cluster_states | Object | Statistics for the published cluster states for the node. +published_cluster_states.full_states | Integer | The number of published cluster states. +published_cluster_states.incompatible_diffs | Integer | The number of incompatible differences between published cluster states. +published_cluster_states.compatible_diffs | Integer | The number of compatible differences between published cluster states. +cluster_state_stats | Object | Cluster state update statistics published by the active leader. +cluster_state_stats.overall | Object | Overall cluster state update statistics. +cluster_state_stats.overall.update_count | Integer | The total number of successful cluster state updates. +cluster_state_stats.overall.total_time_in_millis | Integer | The total amount of time taken for all cluster state updates, in milliseconds. +cluster_state_stats.overall.failed_count | Integer | The total number of failed cluster state updates. 
+cluster_state_stats.remote_upload | Object | Cluster state update statistics related to remote uploads. +cluster_state_stats.remote_upload.success_count | Integer | The total number of successful cluster state updates uploaded to the remote store. +cluster_state_stats.remote_upload.failed_count | Integer | The total number of cluster state updates that failed to upload to the remote store. +cluster_state_stats.remote_upload.total_time_in_millis | Integer | The total amount of time taken for all cluster state updates uploaded to the remote store, in milliseconds. +cluster_state_stats.remote_upload.cleanup_attempt_failed_count | Integer | The total number of failures encountered while trying to clean up older cluster states from the remote store. + +### `ingest` + +The `ingest` object contains the ingest statistics and has the following properties. + +Field | Field type | Description +:--- | :--- | :--- +total | Integer | Ingest statistics for the node's lifetime. +total.count | Integer | The total number of documents ingested by the node. +total.time_in_millis | Integer | The total amount of time for preprocessing ingest documents, in milliseconds. +total.current | Integer | The total number of documents that are currently being ingested by the node. +total.failed | Integer | The total number of failed ingestions for the node. +pipelines | Object | Ingest pipeline statistics for the node. Each pipeline is a nested object that is specified by its ID and has the following properties. +pipelines._id_.count | Integer | The number of documents preprocessed by the ingest pipeline. +pipelines._id_.time_in_millis | Integer | The total amount of time for preprocessing documents in the ingest pipeline, in milliseconds. +pipelines._id_.failed | Integer | The total number of failed ingestions for the ingest pipeline. +pipelines._id_.processors | Array of objects | Statistics for the ingest processors. Includes the number of documents that are currently transformed, the total number of transformed documents, the number of failed transformations, and the time spent transforming documents. + +### `search_pipeline` + +The `search_pipeline` object contains the statistics related to [search pipelines]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) and has the following properties. + +Field | Field type | Description +:--- | :--- | :--- +total_request | Object | Cumulative statistics related to all search request processors. +total_request.count | Integer | The total number of search request processor executions. +total_request.time_in_millis | Integer | The total amount of time for all search request processor executions, in milliseconds. +total_request.current | Integer | The total number of search request processor executions currently in progress. +total_request.failed | Integer | The total number of failed search request processor executions. +total_response | Object | Cumulative statistics related to all search response processors. +total_response.count | Integer | The total number of search response processor executions. +total_response.time_in_millis | Integer | The total amount of time for all search response processor executions, in milliseconds. +total_response.current | Integer | The total number of search response processor executions currently in progress. +total_response.failed | Integer | The total number of failed search response processor executions. +pipelines | Object | Search pipeline statistics. 
Each pipeline is a nested object specified by its ID, with the properties listed in the following rows. If a processor has a `tag`, statistics for the processor are provided in the object with the name `<processor_type>:<tag>` (for example, `filter_query:abc`). Statistics for all processors of the same type that do not have a `tag` are aggregated and provided in the object with the name `<processor_type>` (for example, `filter_query`).
+pipelines._id_.request.count | Integer | The number of search request processor executions performed by the search pipeline.
+pipelines._id_.request.time_in_millis | Integer | The total amount of time for search request processor executions in the search pipeline, in milliseconds.
+pipelines._id_.request.current | Integer | The number of search request processor executions currently in progress for the search pipeline.
+pipelines._id_.request.failed | Integer | The number of failed search request processor executions for the search pipeline.
+pipelines._id_.request_processors | Array of objects | Statistics for the search request processors. Includes the total number of executions, the total amount of time of executions, the total number of executions currently in progress, and the number of failed executions.
+pipelines._id_.response.count | Integer | The number of search response processor executions performed by the search pipeline.
+pipelines._id_.response.time_in_millis | Integer | The total amount of time for search response processor executions in the search pipeline, in milliseconds.
+pipelines._id_.response.current | Integer | The number of search response processor executions currently in progress for the search pipeline.
+pipelines._id_.response.failed | Integer | The number of failed search response processor executions for the search pipeline.
+pipelines._id_.response_processors | Array of objects | Statistics for the search response processors. Includes the total number of executions, the total amount of time of executions, the total number of executions currently in progress, and the number of failed executions.
+
+### `adaptive_selection`
+
+The `adaptive_selection` object contains the adaptive selection statistics. Each entry is specified by the node ID and has the following properties.
+
+Field | Field type | Description
+:--- | :--- | :---
+outgoing_searches | Integer | The number of outgoing search requests for the node.
+avg_queue_size | Integer | The rolling average queue size of search requests for the node (exponentially weighted).
+avg_service_time_ns | Integer | The rolling average service time for search requests, in nanoseconds (exponentially weighted).
+avg_response_time_ns | Integer | The rolling average response time for search requests, in nanoseconds (exponentially weighted).
+rank | String | The node's rank that is used to select shards when routing requests.
+
+### `indexing_pressure`
+
+The `indexing_pressure` object contains the indexing pressure statistics and has the following properties.
+
+Field | Field type | Description
+:--- | :--- | :---
+memory | Object | Statistics related to memory consumption for the indexing load.
+memory.current | Object | Statistics related to memory consumption for the current indexing load.
+memory.current.combined_coordinating_and_primary_in_bytes | Integer | The total memory used by indexing requests in the coordinating or primary stages, in bytes.
A node can reuse the coordinating memory if the primary stage is run locally, so the total memory does not necessarily equal the sum of the coordinating and primary stage memory usage.
+memory.current.coordinating_in_bytes | Integer | The total memory consumed by indexing requests in the coordinating stage, in bytes.
+memory.current.primary_in_bytes | Integer | The total memory consumed by indexing requests in the primary stage, in bytes.
+memory.current.replica_in_bytes | Integer | The total memory consumed by indexing requests in the replica stage, in bytes.
+memory.current.all_in_bytes | Integer | The total memory consumed by indexing requests in the coordinating, primary, or replica stages, in bytes.
+
+### `shard_indexing_pressure`
+
+The `shard_indexing_pressure` object contains the [shard indexing pressure]({{site.url}}{{site.baseurl}}/opensearch/shard-indexing-backpressure) statistics and has the following properties.
+
+Field | Field type | Description
+:--- | :--- | :---
+[stats]({{site.url}}{{site.baseurl}}/opensearch/stats-api/) | Object | Statistics about shard indexing pressure.
+total_rejections_breakup_shadow_mode | Object | If running in shadow mode, the `total_rejections_breakup_shadow_mode` object contains statistics about the request rejection criteria for all shards in the node.
+total_rejections_breakup_shadow_mode.node_limits | Integer | The total number of rejections due to the node memory limit. When all shards reach the memory limit assigned to the node (for example, 10% of the heap size), the shard is unable to take in more traffic on the node, and the indexing request is rejected.
+total_rejections_breakup_shadow_mode.no_successful_request_limits | Integer | The total number of rejections when the node occupancy level is breaching its soft limit and the shard has multiple outstanding requests that are waiting to be executed. In this case, additional indexing requests are rejected until the system recovers.
+total_rejections_breakup_shadow_mode.throughput_degradation_limits | Integer | The total number of rejections when the node occupancy level is breaching its soft limit and there is a constant deterioration in the request turnaround at the shard level. In this case, additional indexing requests are rejected until the system recovers.
+enabled | Boolean | Specifies whether the shard indexing pressure feature is turned on for the node.
+enforced | Boolean | If `true`, the shard indexing pressure runs in enforced mode (there are rejections). If `false`, the shard indexing pressure runs in shadow mode (there are no rejections, but statistics are recorded and can be retrieved in the `total_rejections_breakup_shadow_mode` object). Only applicable if shard indexing pressure is enabled.
+
+### `resource_usage_stats`
+
+The `resource_usage_stats` object contains the resource usage statistics. Each entry is specified by the node ID and has the following properties.
+
+Field | Field type | Description
+:--- | :--- | :---
+timestamp | Integer | The last refresh time for the resource usage statistics, in milliseconds since the epoch.
+cpu_utilization_percent | Float | The average CPU usage of the OpenSearch process within the time period configured in the `node.resource.tracker.global_cpu_usage.window_duration` setting.
+memory_utilization_percent | Float | The node JVM memory usage statistics within the time period configured in the `node.resource.tracker.global_jvmmp.window_duration` setting.
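+
+You do not have to retrieve all of the preceding sections at once: the Nodes Stats API accepts a comma-separated list of metrics in the path, so you can limit the response to the statistics you need. The following is a minimal sketch that requests only the circuit breaker and shard indexing pressure sections; the metric names `breaker` and `shard_indexing_pressure` are assumptions based on the corresponding response sections, so adjust them to the metric names supported by your version:
+
+```
+GET _nodes/stats/breaker,shard_indexing_pressure
+```
+{% include copy-curl.html %}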
+ +### `admission_control` + +The `admission_control` object contains the rejection count of search and indexing requests based on resource consumption and has the following properties. +Field | Field type | Description +:--- | :--- | :--- +admission_control.global_cpu_usage.transport.rejection_count.search | Integer | The total number of search rejections in the transport layer when the node CPU usage limit was breached. In this case, additional search requests are rejected until the system recovers. +admission_control.global_cpu_usage.transport.rejection_count.indexing | Integer | The total number of indexing rejections in the transport layer when the node CPU usage limit was breached. In this case, additional indexing requests are rejected until the system recovers. + +## Required permissions + +If you use the Security plugin, make sure you have the appropriate permissions: `cluster:monitor/nodes/stats`. diff --git a/_api-reference/nodes-apis/nodes-usage.md b/_api-reference/nodes-apis/nodes-usage.md new file mode 100644 index 00000000..532ddb62 --- /dev/null +++ b/_api-reference/nodes-apis/nodes-usage.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Nodes usage +parent: Nodes APIs +nav_order: 40 +--- + +# Nodes usage +**Introduced 1.0** +{: .label .label-purple } + +The nodes usage endpoint returns low-level information about REST action usage on nodes. + +## Path and HTTP methods + +``` +GET _nodes/usage +GET _nodes//usage +GET _nodes/usage/ +GET _nodes//usage/ +``` + +## Path parameters + +You can include the following optional path parameters in your request. + +Parameter | Type | Description +:--- | :--- | :--- +nodeId | String | A comma-separated list of nodeIds used to filter results. Supports [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters). Defaults to `_all`. +metric | String | The metrics that will be included in the response. You can set the string to either `_all` or `rest_actions`. `rest_actions` returns the total number of times an action has been called on the node. `_all` returns all stats from the node. Defaults to `_all`. + +## Query parameters + +You can include the following optional query parameters in your request. + +Parameter | Type | Description +:--- | :---| :--- +timeout | Time | Sets the time limit for a response from the node. Default is `30s`. +cluster_manager_timeout | Time | Sets the time limit for a response from the cluster manager. Default is `30s`. 
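+
+To request only a subset of the usage information, you can combine the path parameters above. For example, the following sketch uses the `_local` node filter together with the `rest_actions` metric to return only the REST action counts for the node that receives the request:
+
+```
+GET _nodes/_local/usage/rest_actions
+```
+{% include copy-curl.html %}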
+ +#### Example request + +The following request returns usage details for all nodes: + +``` +GET _nodes/usage +``` +{% include copy-curl.html %} + +#### Example response + +The following is an example response: + +```json +{ + "_nodes" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + }, + "cluster_name" : "opensearch-cluster", + "nodes" : { + "t7uqHu4SSuWObK3ElkCRfw" : { + "timestamp" : 1665695174312, + "since" : 1663994849643, + "rest_actions" : { + "opendistro_get_rollup_action" : 3, + "nodes_usage_action" : 1, + "list_dangling_indices" : 1, + "get_index_template_action" : 258, + "nodes_info_action" : 152665, + "get_mapping_action" : 259, + "get_data_streams_action" : 12, + "cat_indices_action" : 6, + "get_indices_action" : 3, + "ism_explain_action" : 7, + "nodes_reload_action" : 1, + "get_policy_action" : 3, + "PerformanceAnalyzerClusterConfigAction" : 2, + "index_policy_action" : 1, + "rank_eval_action" : 3, + "search_action" : 592, + "get_aliases_action" : 258, + "document_mget_action" : 2, + "document_get_action" : 30, + "count_action" : 1, + "main_action" : 1 + }, + "aggregations" : { } + } + } +} +``` + +## Required permissions + +If you use the Security plugin, make sure you set the following permissions: `cluster:manage/nodes` or `cluster:monitor/nodes`. \ No newline at end of file diff --git a/_opensearch/popular-api.md b/_api-reference/popular-api.md similarity index 87% rename from _opensearch/popular-api.md rename to _api-reference/popular-api.md index 28fdb077..2191b756 100644 --- a/_opensearch/popular-api.md +++ b/_api-reference/popular-api.md @@ -2,11 +2,15 @@ layout: default title: Popular APIs nav_order: 96 +redirect_from: + - /opensearch/popular-api/ --- # Popular APIs +**Introduced 1.0** +{: .label .label-purple } -This page contains sample requests for popular OpenSearch operations. +This page contains example requests for popular OpenSearch operations. --- @@ -77,14 +81,14 @@ POST _bulk ``` -## List all indices +## List all indexes ``` GET _cat/indices?v&expand_wildcards=all ``` -## Open or close all indices that match a pattern +## Open or close all indexes that match a pattern ``` POST my-logs*/_open @@ -92,7 +96,7 @@ POST my-logs*/_close ``` -## Delete all indices that match a pattern +## Delete all indexes that match a pattern ``` DELETE my-logs* @@ -115,7 +119,7 @@ GET _cat/aliases?v ``` -## Search an index or all indices that match a pattern +## Search an index or all indexes that match a pattern ``` GET my-logs/_search?q=test diff --git a/_api-reference/profile.md b/_api-reference/profile.md new file mode 100644 index 00000000..94c7857b --- /dev/null +++ b/_api-reference/profile.md @@ -0,0 +1,1004 @@ +--- +layout: default +title: Profile +nav_order: 55 +--- + +# Profile +**Introduced 1.0** +{: .label .label-purple } + +The Profile API provides timing information about the execution of individual components of a search request. Using the Profile API, you can debug slow requests and understand how to improve their performance. The Profile API does not measure the following: + +- Network latency +- Time spent in the search fetch phase +- Amount of time a request spends in queues +- Idle time while merging shard responses on the coordinating node + +The Profile API is a resource-consuming operation that adds overhead to search operations. 
+{: .warning} + +## Concurrent segment search + +Starting in OpenSearch 2.12, [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/) allows each shard-level request to search segments in parallel during the query phase. The Profile API response contains several additional fields with statistics about _slices_. + +A slice is the unit of work that can be executed by a thread. Each query can be partitioned into multiple slices, with each slice containing one or more segments. All the slices can be executed either in parallel or in some order depending on the available threads in the pool. + +In general, the max/min/avg slice time captures statistics across all slices for a timing type. For example, when profiling aggregations, the `max_slice_time_in_nanos` field in the `aggregations` section shows the maximum time consumed by the aggregation operation and its children across all slices. + +#### Example request: Non-concurrent search + +To use the Profile API, include the `profile` parameter set to `true` in the search request sent to the `_search` endpoint: + +```json +GET /testindex/_search +{ + "profile": true, + "query" : { + "match" : { "title" : "wind" } + } +} +``` +{% include copy-curl.html %} + +To turn on human-readable format, include the `?human=true` query parameter in the request: + +```json +GET /testindex/_search?human=true +{ + "profile": true, + "query" : { + "match" : { "title" : "wind" } + } +} +``` +{% include copy-curl.html %} + +The response contains an additional `time` field with human-readable units, for example: + +```json +"collector": [ + { + "name": "SimpleTopScoreDocCollector", + "reason": "search_top_hits", + "time": "113.7micros", + "time_in_nanos": 113711 + } +] +``` + +The Profile API response is verbose, so if you're running the request through the `curl` command, include the `?pretty` query parameter to make the response easier to understand. +{: .tip} + +#### Example response + +The response contains profiling information: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 21, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.19363807, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_score": 0.19363807, + "_source": { + "title": "The wind rises" + } + }, + { + "_index": "testindex", + "_id": "2", + "_score": 0.17225474, + "_source": { + "title": "Gone with the wind", + "description": "A 1939 American epic historical film" + } + } + ] + }, + "profile": { + "shards": [ + { + "id": "[LidyZ1HVS-u93-73Z49dQg][testindex][0]", + "inbound_network_time_in_millis": 0, + "outbound_network_time_in_millis": 0, + "searches": [ + { + "query": [ + { + "type": "BooleanQuery", + "description": "title:wind title:rise", + "time_in_nanos": 2473919, + "breakdown": { + "set_min_competitive_score_count": 0, + "match_count": 0, + "shallow_advance_count": 0, + "set_min_competitive_score": 0, + "next_doc": 5209, + "match": 0, + "next_doc_count": 2, + "score_count": 2, + "compute_max_score_count": 0, + "compute_max_score": 0, + "advance": 9209, + "advance_count": 2, + "score": 20751, + "build_scorer_count": 4, + "create_weight": 1404458, + "shallow_advance": 0, + "create_weight_count": 1, + "build_scorer": 1034292 + }, + "children": [ + { + "type": "TermQuery", + "description": "title:wind", + "time_in_nanos": 813581, + "breakdown": { + "set_min_competitive_score_count": 0, + "match_count": 0, + "shallow_advance_count": 0, + "set_min_competitive_score": 0, + "next_doc": 3291, + "match": 0, + "next_doc_count": 2, + "score_count": 2, + "compute_max_score_count": 0, + "compute_max_score": 0, + "advance": 7208, + "advance_count": 2, + "score": 18666, + "build_scorer_count": 6, + "create_weight": 616375, + "shallow_advance": 0, + "create_weight_count": 1, + "build_scorer": 168041 + } + }, + { + "type": "TermQuery", + "description": "title:rise", + "time_in_nanos": 191083, + "breakdown": { + "set_min_competitive_score_count": 0, + "match_count": 0, + "shallow_advance_count": 0, + "set_min_competitive_score": 0, + "next_doc": 0, + "match": 0, + "next_doc_count": 0, + "score_count": 0, + "compute_max_score_count": 0, + "compute_max_score": 0, + "advance": 0, + "advance_count": 0, + "score": 0, + "build_scorer_count": 2, + "create_weight": 188625, + "shallow_advance": 0, + "create_weight_count": 1, + "build_scorer": 2458 + } + } + ] + } + ], + "rewrite_time": 192417, + "collector": [ + { + "name": "SimpleTopScoreDocCollector", + "reason": "search_top_hits", + "time_in_nanos": 77291 + } + ] + } + ], + "aggregations": [] + } + ] + } +} +``` +
+ +#### Example response: Concurrent segment search + +The following is an example response for a concurrent segment search with three segment slices: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 5, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + ... + ] + }, + "aggregations": { + ... + }, + "profile": { + "shards": [ + { + "id": "[9Y7lbpaWRhyr5Y-41Zl48g][idx][0]", + "inbound_network_time_in_millis": 0, + "outbound_network_time_in_millis": 0, + "searches": [ + { + "query": [ + { + "type": "MatchAllDocsQuery", + "description": "*:*", + "time_in_nanos": 868000, + "max_slice_time_in_nanos": 19376, + "min_slice_time_in_nanos": 12250, + "avg_slice_time_in_nanos": 16847, + "breakdown": { + "max_match": 0, + "set_min_competitive_score_count": 0, + "match_count": 0, + "avg_score_count": 1, + "shallow_advance_count": 0, + "next_doc": 29708, + "min_build_scorer": 3125, + "score_count": 5, + "compute_max_score_count": 0, + "advance": 0, + "min_set_min_competitive_score": 0, + "min_advance": 0, + "score": 29250, + "avg_set_min_competitive_score_count": 0, + "min_match_count": 0, + "avg_score": 333, + "max_next_doc_count": 3, + "max_compute_max_score_count": 0, + "avg_shallow_advance": 0, + "max_shallow_advance_count": 0, + "set_min_competitive_score": 0, + "min_build_scorer_count": 2, + "next_doc_count": 8, + "min_match": 0, + "avg_next_doc": 888, + "compute_max_score": 0, + "min_set_min_competitive_score_count": 0, + "max_build_scorer": 5791, + "avg_match_count": 0, + "avg_advance": 0, + "build_scorer_count": 6, + "avg_build_scorer_count": 2, + "min_next_doc_count": 2, + "min_shallow_advance_count": 0, + "max_score_count": 2, + "avg_match": 0, + "avg_compute_max_score": 0, + "max_advance": 0, + "avg_shallow_advance_count": 0, + "avg_set_min_competitive_score": 0, + "avg_compute_max_score_count": 0, + "avg_build_scorer": 4027, + "max_set_min_competitive_score_count": 0, + "advance_count": 0, + "max_build_scorer_count": 2, + "shallow_advance": 0, + "min_compute_max_score": 0, + "max_match_count": 0, + "create_weight_count": 1, + "build_scorer": 32459, + "max_set_min_competitive_score": 0, + "max_compute_max_score": 0, + "min_shallow_advance": 0, + "match": 0, + "max_shallow_advance": 0, + "avg_advance_count": 0, + "min_next_doc": 708, + "max_advance_count": 0, + "min_score": 291, + "max_next_doc": 999, + "create_weight": 1834, + "avg_next_doc_count": 2, + "max_score": 376, + "min_compute_max_score_count": 0, + "min_score_count": 1, + "min_advance_count": 0 + } + } + ], + "rewrite_time": 8126, + "collector": [ + { + "name": "QueryCollectorManager", + "reason": "search_multi", + "time_in_nanos": 564708, + "reduce_time_in_nanos": 1251042, + "max_slice_time_in_nanos": 121959, + "min_slice_time_in_nanos": 28958, + "avg_slice_time_in_nanos": 83208, + "slice_count": 3, + "children": [ + { + "name": "SimpleTopDocsCollectorManager", + "reason": "search_top_hits", + "time_in_nanos": 500459, + "reduce_time_in_nanos": 840125, + "max_slice_time_in_nanos": 22168, + "min_slice_time_in_nanos": 5792, + "avg_slice_time_in_nanos": 12084, + "slice_count": 3 + }, + { + "name": "NonGlobalAggCollectorManager: [histo]", + "reason": "aggregation", + "time_in_nanos": 552167, + "reduce_time_in_nanos": 311292, + "max_slice_time_in_nanos": 95333, + "min_slice_time_in_nanos": 18416, + "avg_slice_time_in_nanos": 66249, + "slice_count": 3 + } + ] + } + ] + } + ], + "aggregations": [ + { + "type": "NumericHistogramAggregator", + "description": "histo", + "time_in_nanos": 2847834, + "max_slice_time_in_nanos": 117374, + 
"min_slice_time_in_nanos": 20624, + "avg_slice_time_in_nanos": 75597, + "breakdown": { + "min_build_leaf_collector": 9500, + "build_aggregation_count": 3, + "post_collection": 3209, + "max_collect_count": 2, + "initialize_count": 3, + "reduce_count": 0, + "avg_collect": 17055, + "max_build_aggregation": 26000, + "avg_collect_count": 1, + "max_build_leaf_collector": 64833, + "min_build_leaf_collector_count": 1, + "build_aggregation": 41125, + "min_initialize": 583, + "max_reduce": 0, + "build_leaf_collector_count": 3, + "avg_reduce": 0, + "min_collect_count": 1, + "avg_build_leaf_collector_count": 1, + "avg_build_leaf_collector": 45000, + "max_collect": 24625, + "reduce": 0, + "avg_build_aggregation": 12013, + "min_post_collection": 292, + "max_initialize": 1333, + "max_post_collection": 750, + "collect_count": 5, + "avg_post_collection": 541, + "avg_initialize": 986, + "post_collection_count": 3, + "build_leaf_collector": 86833, + "min_collect": 6250, + "min_build_aggregation": 3541, + "initialize": 2786791, + "max_build_leaf_collector_count": 1, + "min_reduce": 0, + "collect": 29834 + }, + "debug": { + "total_buckets": 1 + } + } + ] + } + ] + } +} +``` +
+ +## Response fields + +The response includes the following fields. + +Field | Data type | Description +:--- | :--- | :--- +`profile` | Object | Contains profiling information. +`profile.shards` | Array of objects | A search request can be executed against one or more shards in the index, and a search may involve one or more indexes. Thus, the `profile.shards` array contains profiling information for each shard that was involved in the search. +`profile.shards.id` | String | The shard ID of the shard in the `[node-ID][index-name][shard-ID]` format. +`profile.shards.searches` | Array of objects | A search represents a query executed against the underlying Lucene index. Most search requests execute a single search against a Lucene index, but some search requests can execute more than one search. For example, including a global aggregation results in a secondary `match_all` query for the global context. The `profile.shards` array contains profiling information about each search execution. +[`profile.shards.searches.query`](#the-query-array) | Array of objects | Profiling information about the query execution. +`profile.shards.searches.rewrite_time` | Integer | All Lucene queries are rewritten. A query and its children may be rewritten more than once, until the query stops changing. The rewriting process involves performing optimizations, such as removing redundant clauses or replacing a query path with a more efficient one. After the rewriting process, the original query may change significantly. The `rewrite_time` field contains the cumulative total rewrite time for the query and all its children, in nanoseconds. +[`profile.shards.searches.collector`](#the-collector-array) | Array of objects | Profiling information about the Lucene collectors that ran the search. +[`profile.shards.aggregations`](#aggregations) | Array of objects | Profiling information about the aggregation execution. + +### The `query` array + +The `query` array contains objects with the following fields. + +Field | Data type | Description +:--- | :--- | :--- +`type` | String | The Lucene query type into which the search query was rewritten. Corresponds to the Lucene class name (which often has the same name in OpenSearch). +`description` | String | Contains a Lucene explanation of the query. Helps differentiate queries with the same type. +`time_in_nanos` | Long | The total elapsed time for this query, in nanoseconds. For concurrent segment search, `time_in_nanos` is the total time spent across all the slices (the difference between the last completed slice execution end time and the first slice execution start time). +`max_slice_time_in_nanos` | Long | The maximum amount of time taken by any slice to run a query, in nanoseconds. This field is included only if you enable concurrent segment search. +`min_slice_time_in_nanos` | Long | The minimum amount of time taken by any slice to run a query, in nanoseconds. This field is included only if you enable concurrent segment search. +`avg_slice_time_in_nanos` | Long | The average amount of time taken by any slice to run a query, in nanoseconds. This field is included only if you enable concurrent segment search. +[`breakdown`](#the-breakdown-object) | Object | Contains timing statistics about low-level Lucene execution. +`children` | Array of objects | If a query has subqueries (children), this field contains information about the subqueries. 
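+
+The slice-related fields in the preceding table appear only when concurrent segment search is enabled. As a quick sketch, one way to enable it dynamically for the whole cluster is shown below; this assumes the `search.concurrent_segment_search.enabled` dynamic cluster setting (available in OpenSearch 2.12 and later), so check the concurrent segment search documentation for the setting that applies to your version:
+
+```json
+PUT _cluster/settings
+{
+  "persistent": {
+    "search.concurrent_segment_search.enabled": true
+  }
+}
+```
+{% include copy-curl.html %}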
+
+### The `breakdown` object
+
+The `breakdown` object represents the timing statistics about low-level Lucene execution, broken down by method. Timings are listed in wall-clock nanoseconds and are not normalized. The `breakdown` timings are inclusive of all child times. The `breakdown` object comprises the following fields. All fields contain integer values.
+
+Field | Description
+:--- | :---
+`create_weight` | A `Query` object in Lucene is immutable. Yet, Lucene should be able to reuse `Query` objects in multiple `IndexSearcher` objects. Thus, `Query` objects need to keep temporary state and statistics associated with the index in which the query is executed. To achieve reuse, every `Query` object generates a `Weight` object, which keeps the temporary context (state) associated with the `<IndexSearcher, Query>` tuple. The `create_weight` field contains the amount of time spent creating the `Weight` object.
+`build_scorer` | A `Scorer` iterates over matching documents and generates a score for each document. The `build_scorer` field contains the amount of time spent generating the `Scorer` object. This does not include the time spent scoring the documents. The `Scorer` initialization time depends on the optimization and complexity of a particular query. The `build_scorer` parameter also includes the amount of time associated with caching, if caching is applicable and enabled for the query.
+`next_doc` | The `next_doc` Lucene method returns the document ID of the next document that matches the query. This method is a special type of the `advance` method and is equivalent to `advance(docId() + 1)`. The `next_doc` method is more convenient for many Lucene queries. The `next_doc` field contains the amount of time required to determine the next matching document, which varies depending on the query type.
+`advance` | The `advance` method is a lower-level version of the `next_doc` method in Lucene. It also finds the next matching document but necessitates that the calling query perform additional tasks, such as identifying skips. Some queries, such as conjunctions (`must` clauses in Boolean queries), cannot use `next_doc`. For those queries, `advance` is timed.
+`match` | For some queries, document matching is performed in two steps. First, the document is matched approximately. Second, those documents that are approximately matched are examined through a more comprehensive process. For example, a phrase query first checks whether a document contains all terms in the phrase. Next, it verifies that the terms are in order (which is a more expensive process). The `match` field is non-zero only for those queries that use the two-step verification process.
+`score` | Contains the time taken for a `Scorer` to score a particular document.
+`shallow_advance` | Contains the amount of time required to execute the `advanceShallow` Lucene method.
+`compute_max_score` | Contains the amount of time required to execute the `getMaxScore` Lucene method.
+`set_min_competitive_score` | Contains the amount of time required to execute the `setMinCompetitiveScore` Lucene method.
+`<method>_count` | Contains the number of invocations of a `<method>`. For example, `advance_count` contains the number of invocations of the `advance` method. Different invocations of the same method occur because the method is called on different documents. You can determine the selectivity of a query by comparing counts in different query components.
+`max_<method>` | The maximum amount of time taken by any slice to run a query method.
Breakdown stats for the `create_weight` method do not include profiled `max` time because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search.
+`min_<method>` | The minimum amount of time taken by any slice to run a query method. Breakdown stats for the `create_weight` method do not include profiled `min` time because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search.
+`avg_<method>` | The average amount of time taken by any slice to run a query method. Breakdown stats for the `create_weight` method do not include profiled `avg` time because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search.
+`max_<method>_count` | The maximum number of invocations of a `<method>` on any slice. Breakdown stats for the `create_weight` method do not include a profiled `max` count because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search.
+`min_<method>_count` | The minimum number of invocations of a `<method>` on any slice. Breakdown stats for the `create_weight` method do not include a profiled `min` count because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search.
+`avg_<method>_count` | The average number of invocations of a `<method>` on any slice. Breakdown stats for the `create_weight` method do not include a profiled `avg` count because the method runs at the query level rather than the slice level. This field is included only if you enable concurrent segment search.
+
+### The `collector` array
+
+The `collector` array contains information about Lucene collectors. A collector is responsible for coordinating document traversal and scoring and for collecting matching documents. Using collectors, individual queries can record aggregation results and execute global queries or post-query filters.
+
+Field | Description
+:--- | :---
+`name` | The collector name. In the [example response](#example-response), the `collector` is a single `SimpleTopScoreDocCollector`---the default scoring and sorting collector.
+`reason` | Contains a description of the collector. For possible field values, see [Collector reasons](#collector-reasons).
+`time_in_nanos` | The total elapsed time for this collector, in nanoseconds. For concurrent segment search, `time_in_nanos` is the total amount of time across all slices (the difference between the last completed slice execution end time and the first slice execution start time).
+`children` | If a collector has subcollectors (children), this field contains information about the subcollectors.
+`max_slice_time_in_nanos` | The maximum amount of time taken by any slice, in nanoseconds. This field is included only if you enable concurrent segment search.
+`min_slice_time_in_nanos` | The minimum amount of time taken by any slice, in nanoseconds. This field is included only if you enable concurrent segment search.
+`avg_slice_time_in_nanos` | The average amount of time taken by any slice, in nanoseconds. This field is included only if you enable concurrent segment search.
+`slice_count` | The total slice count for this query. This field is included only if you enable concurrent segment search.
+`reduce_time_in_nanos` | The amount of time taken to reduce results for all slice collectors, in nanoseconds.
This field is included only if you enable concurrent segment search. + +Collector times are calculated, combined, and normalized independently, so they are independent of query times. +{: .note} + +#### Collector reasons + +The following table describes all available collector reasons. + +Reason | Description +:--- | :--- +`search_sorted` | A collector that scores and sorts documents. Present in most simple searches. +`search_count` | A collector that counts the number of matching documents but does not fetch the source. Present when `size: 0` is specified. +`search_terminate_after_count` | A collector that searches for matching documents and terminates the search when it finds a specified number of documents. Present when the `terminate_after_count` query parameter is specified. +`search_min_score` | A collector that returns matching documents that have a score greater than a minimum score. Present when the `min_score` parameter is specified. +`search_multi` | A wrapper collector for other collectors. Present when search, aggregations, global aggregations, and post filters are combined in a single search. +`search_timeout` | A collector that stops running after a specified period of time. Present when a `timeout` parameter is specified. +`aggregation` | A collector for aggregations that is run against the specified query scope. OpenSearch uses a single `aggregation` collector to collect documents for all aggregations. +`global_aggregation` | A collector that is run against the global query scope. Global scope is different from a specified query scope, so in order to collect the entire dataset, a `match_all` query must be run. + +## Aggregations + +To profile aggregations, send an aggregation request and provide the `profile` parameter set to `true`. + +#### Example request: Global aggregation + +```json +GET /opensearch_dashboards_sample_data_ecommerce/_search +{ + "profile": "true", + "size": 0, + "query": { + "match": { "manufacturer": "Elitelligence" } + }, + "aggs": { + "all_products": { + "global": {}, + "aggs": { + "avg_price": { "avg": { "field": "taxful_total_price" } } + } + }, + "elitelligence_products": { "avg": { "field": "taxful_total_price" } } + } +} +``` +{% include copy-curl.html %} + +#### Example response: Global aggregation + +The response contains profiling information: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1370, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "all_products": { + "doc_count": 4675, + "avg_price": { + "value": 75.05542864304813 + } + }, + "elitelligence_products": { + "value": 68.4430200729927 + } + }, + "profile": { + "shards": [ + { + "id": "[LidyZ1HVS-u93-73Z49dQg][opensearch_dashboards_sample_data_ecommerce][0]", + "inbound_network_time_in_millis": 0, + "outbound_network_time_in_millis": 0, + "searches": [ + { + "query": [ + { + "type": "ConstantScoreQuery", + "description": "ConstantScore(manufacturer:elitelligence)", + "time_in_nanos": 1367487, + "breakdown": { + "set_min_competitive_score_count": 0, + "match_count": 0, + "shallow_advance_count": 0, + "set_min_competitive_score": 0, + "next_doc": 634321, + "match": 0, + "next_doc_count": 1370, + "score_count": 0, + "compute_max_score_count": 0, + "compute_max_score": 0, + "advance": 173250, + "advance_count": 2, + "score": 0, + "build_scorer_count": 4, + "create_weight": 132458, + "shallow_advance": 0, + "create_weight_count": 1, + "build_scorer": 427458 + }, + "children": [ + { + "type": "TermQuery", + "description": "manufacturer:elitelligence", + "time_in_nanos": 1174794, + "breakdown": { + "set_min_competitive_score_count": 0, + "match_count": 0, + "shallow_advance_count": 0, + "set_min_competitive_score": 0, + "next_doc": 470918, + "match": 0, + "next_doc_count": 1370, + "score_count": 0, + "compute_max_score_count": 0, + "compute_max_score": 0, + "advance": 172084, + "advance_count": 2, + "score": 0, + "build_scorer_count": 4, + "create_weight": 114041, + "shallow_advance": 0, + "create_weight_count": 1, + "build_scorer": 417751 + } + } + ] + } + ], + "rewrite_time": 42542, + "collector": [ + { + "name": "MultiCollector", + "reason": "search_multi", + "time_in_nanos": 778406, + "children": [ + { + "name": "EarlyTerminatingCollector", + "reason": "search_count", + "time_in_nanos": 70290 + }, + { + "name": "ProfilingAggregator: [elitelligence_products]", + "reason": "aggregation", + "time_in_nanos": 502780 + } + ] + } + ] + }, + { + "query": [ + { + "type": "ConstantScoreQuery", + "description": "ConstantScore(*:*)", + "time_in_nanos": 995345, + "breakdown": { + "set_min_competitive_score_count": 0, + "match_count": 0, + "shallow_advance_count": 0, + "set_min_competitive_score": 0, + "next_doc": 930803, + "match": 0, + "next_doc_count": 4675, + "score_count": 0, + "compute_max_score_count": 0, + "compute_max_score": 0, + "advance": 2209, + "advance_count": 2, + "score": 0, + "build_scorer_count": 4, + "create_weight": 23875, + "shallow_advance": 0, + "create_weight_count": 1, + "build_scorer": 38458 + }, + "children": [ + { + "type": "MatchAllDocsQuery", + "description": "*:*", + "time_in_nanos": 431375, + "breakdown": { + "set_min_competitive_score_count": 0, + "match_count": 0, + "shallow_advance_count": 0, + "set_min_competitive_score": 0, + "next_doc": 389875, + "match": 0, + "next_doc_count": 4675, + "score_count": 0, + "compute_max_score_count": 0, + "compute_max_score": 0, + "advance": 1167, + "advance_count": 2, + "score": 0, + "build_scorer_count": 4, + "create_weight": 9458, + "shallow_advance": 0, + "create_weight_count": 1, + "build_scorer": 30875 + } + } + ] + } + ], + "rewrite_time": 8792, + "collector": [ + { + "name": "ProfilingAggregator: [all_products]", + 
"reason": "aggregation_global", + "time_in_nanos": 1310536 + } + ] + } + ], + "aggregations": [ + { + "type": "AvgAggregator", + "description": "elitelligence_products", + "time_in_nanos": 319918, + "breakdown": { + "reduce": 0, + "post_collection_count": 1, + "build_leaf_collector": 130709, + "build_aggregation": 2709, + "build_aggregation_count": 1, + "build_leaf_collector_count": 2, + "post_collection": 584, + "initialize": 4750, + "initialize_count": 1, + "reduce_count": 0, + "collect": 181166, + "collect_count": 1370 + } + }, + { + "type": "GlobalAggregator", + "description": "all_products", + "time_in_nanos": 1519340, + "breakdown": { + "reduce": 0, + "post_collection_count": 1, + "build_leaf_collector": 134625, + "build_aggregation": 59291, + "build_aggregation_count": 1, + "build_leaf_collector_count": 2, + "post_collection": 5041, + "initialize": 24500, + "initialize_count": 1, + "reduce_count": 0, + "collect": 1295883, + "collect_count": 4675 + }, + "children": [ + { + "type": "AvgAggregator", + "description": "avg_price", + "time_in_nanos": 775967, + "breakdown": { + "reduce": 0, + "post_collection_count": 1, + "build_leaf_collector": 98999, + "build_aggregation": 33083, + "build_aggregation_count": 1, + "build_leaf_collector_count": 2, + "post_collection": 2209, + "initialize": 1708, + "initialize_count": 1, + "reduce_count": 0, + "collect": 639968, + "collect_count": 4675 + } + } + ] + } + ] + } + ] + } +} +``` +
+
+#### Example request: Non-global aggregation
+
+```json
+GET /opensearch_dashboards_sample_data_ecommerce/_search
+{
+  "profile": "true",
+  "size": 0,
+  "aggs": {
+    "avg_taxful_total_price": {
+      "avg": {
+        "field": "taxful_total_price"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+#### Example response: Non-global aggregation
+
+The response contains profiling information:
+
+
+ + Response + + {: .text-delta} + +```json +{ + "took": 13, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "avg_taxful_total_price": { + "value": 75.05542864304813 + } + }, + "profile": { + "shards": [ + { + "id": "[LidyZ1HVS-u93-73Z49dQg][opensearch_dashboards_sample_data_ecommerce][0]", + "inbound_network_time_in_millis": 0, + "outbound_network_time_in_millis": 0, + "searches": [ + { + "query": [ + { + "type": "ConstantScoreQuery", + "description": "ConstantScore(*:*)", + "time_in_nanos": 1690820, + "breakdown": { + "set_min_competitive_score_count": 0, + "match_count": 0, + "shallow_advance_count": 0, + "set_min_competitive_score": 0, + "next_doc": 1614112, + "match": 0, + "next_doc_count": 4675, + "score_count": 0, + "compute_max_score_count": 0, + "compute_max_score": 0, + "advance": 2708, + "advance_count": 2, + "score": 0, + "build_scorer_count": 4, + "create_weight": 20250, + "shallow_advance": 0, + "create_weight_count": 1, + "build_scorer": 53750 + }, + "children": [ + { + "type": "MatchAllDocsQuery", + "description": "*:*", + "time_in_nanos": 770902, + "breakdown": { + "set_min_competitive_score_count": 0, + "match_count": 0, + "shallow_advance_count": 0, + "set_min_competitive_score": 0, + "next_doc": 721943, + "match": 0, + "next_doc_count": 4675, + "score_count": 0, + "compute_max_score_count": 0, + "compute_max_score": 0, + "advance": 1042, + "advance_count": 2, + "score": 0, + "build_scorer_count": 4, + "create_weight": 5041, + "shallow_advance": 0, + "create_weight_count": 1, + "build_scorer": 42876 + } + } + ] + } + ], + "rewrite_time": 22000, + "collector": [ + { + "name": "MultiCollector", + "reason": "search_multi", + "time_in_nanos": 3672676, + "children": [ + { + "name": "EarlyTerminatingCollector", + "reason": "search_count", + "time_in_nanos": 78626 + }, + { + "name": "ProfilingAggregator: [avg_taxful_total_price]", + "reason": "aggregation", + "time_in_nanos": 2834566 + } + ] + } + ] + } + ], + "aggregations": [ + { + "type": "AvgAggregator", + "description": "avg_taxful_total_price", + "time_in_nanos": 1973702, + "breakdown": { + "reduce": 0, + "post_collection_count": 1, + "build_leaf_collector": 199292, + "build_aggregation": 13584, + "build_aggregation_count": 1, + "build_leaf_collector_count": 2, + "post_collection": 6125, + "initialize": 6916, + "initialize_count": 1, + "reduce_count": 0, + "collect": 1747785, + "collect_count": 4675 + } + } + ] + } + ] + } +} +``` +
+ +### Response fields + +The `aggregations` array contains aggregation objects with the following fields. + +Field | Data type | Description +:--- | :--- | :--- +`type` | String | The aggregator type. In the [non-global aggregation example response](#example-response-non-global-aggregation), the aggregator type is `AvgAggregator`. [Global aggregation example response](#example-request-global-aggregation) contains a `GlobalAggregator` with an `AvgAggregator` child. +`description` | String | Contains a Lucene explanation of the aggregation. Helps differentiate aggregations with the same type. +`time_in_nanos` | Long | The total elapsed time for this aggregation, in nanoseconds. For concurrent segment search, `time_in_nanos` is the total amount of time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). +[`breakdown`](#the-breakdown-object-1) | Object | Contains timing statistics about low-level Lucene execution. +`children` | Array of objects | If an aggregation has subaggregations (children), this field contains information about the subaggregations. +`debug` | Object | Some aggregations return a `debug` object that describes the details of the underlying execution. +`max_slice_time_in_nanos` |Long | The maximum amount of time taken by any slice to run an aggregation, in nanoseconds. This field is included only if you enable concurrent segment search. +`min_slice_time_in_nanos` |Long |The minimum amount of time taken by any slice to run an aggregation, in nanoseconds. This field is included only if you enable concurrent segment search. +`avg_slice_time_in_nanos` |Long |The average amount of time taken by any slice to run an aggregation, in nanoseconds. This field is included only if you enable concurrent segment search. + +### The `breakdown` object + +The `breakdown` object represents the timing statistics about low-level Lucene execution, broken down by method. Each field in the `breakdown` object represents an internal Lucene method executed within the aggregation. Timings are listed in wall-clock nanoseconds and are not normalized. The `breakdown` timings are inclusive of all child times. The `breakdown` object is comprised of the following fields. All fields contain integer values. + +Field | Description +:--- | :--- +`initialize` | Contains the amount of time taken to execute the `preCollection()` callback method during `AggregationCollectorManager` creation. For concurrent segment search, the `initialize` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). +`build_leaf_collector`| Contains the time spent running the aggregation's `getLeafCollector()` method, which creates a new collector to collect the given context. For concurrent segment search, the `build_leaf_collector` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). +`collect`| Contains the time spent collecting the documents into buckets. For concurrent segment search, the `collect` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). +`post_collection`| Contains the time spent running the aggregation’s `postCollection()` callback method. 
For concurrent segment search, the `post_collection` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time).
+`build_aggregation` | Contains the time spent running the aggregation’s `buildAggregations()` method, which builds the results of this aggregation. For concurrent segment search, the `build_aggregation` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time).
+`reduce` | Contains the time spent in the `reduce` phase. For concurrent segment search, the `reduce` method contains the total elapsed time across all slices (the difference between the last completed slice execution end time and the first slice execution start time).
+`<method>_count` | Contains the number of invocations of a `<method>`. For example, `build_leaf_collector_count` contains the number of invocations of the `build_leaf_collector` method.
+`max_<method>` | The maximum amount of time taken by any slice to run an aggregation method. This field is included only if you enable concurrent segment search.
+`min_<method>` | The minimum amount of time taken by any slice to run an aggregation method. This field is included only if you enable concurrent segment search.
+`avg_<method>` | The average amount of time taken by any slice to run an aggregation method. This field is included only if you enable concurrent segment search.
+`<method>_count` | The total method count across all slices. For example, for the `collect` method, it is the total number of invocations of this method needed to collect documents into buckets across all slices.
+`max_<method>_count` | The maximum number of invocations of a `<method>` on any slice. This field is included only if you enable concurrent segment search.
+`min_<method>_count` | The minimum number of invocations of a `<method>` on any slice. This field is included only if you enable concurrent segment search.
+`avg_<method>_count` | The average number of invocations of a `<method>` on any slice. This field is included only if you enable concurrent segment search.
diff --git a/_api-reference/rank-eval.md b/_api-reference/rank-eval.md
new file mode 100644
index 00000000..04fd3cf5
--- /dev/null
+++ b/_api-reference/rank-eval.md
@@ -0,0 +1,117 @@
+---
+layout: default
+title: Ranking evaluation
+nav_order: 60
+---
+
+# Ranking evaluation
+**Introduced 1.0**
+{: .label .label-purple }
+
+The [rank]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/rank/) eval endpoint allows you to evaluate the quality of ranked search results.
+
+## Path and HTTP methods
+
+```
+GET /_rank_eval
+POST /_rank_eval
+```
+
+## Query parameters
+
+Query parameters are optional.
+
+Parameter | Data type | Description
+:--- | :--- | :---
+ignore_unavailable | Boolean | Defaults to `false`. When set to `false`, the request returns an error if an index is closed or missing.
+allow_no_indices | Boolean | Defaults to `true`. When set to `false`, the request returns an error if a wildcard expression points to indexes that are closed or missing.
+expand_wildcards | String | Expands wildcard expressions to indexes that are `open`, `closed`, `hidden`, `none`, or `all`.
+search_type | String | Sets the search type to either `query_then_fetch` or `dfs_query_then_fetch`.
+
+## Request fields
+
+The request body must contain at least one parameter.
+
+Field | Description
+:--- | :---
+id | Document or template ID.
+requests | Set multiple search requests within the request field section. +ratings | Document relevance score. +k | The number of documents returned per query. Default is set to 10. +relevant_rating_threshold | The threshold at which documents are considered relevant. Default is set to 1. +normalize | Discounted cumulative gain will be calculated when set to `true`. +maximum_relevance | Sets the maximum relevance score when using the expected reciprocal rank metric. +ignore_unlabeled | Defaults to `false`. Unlabeled documents are ignored when set to `true`. +template_id | Template ID. +params | Parameters used in the template. + +#### Example request + +````json +GET shakespeare/_rank_eval +{ + "requests": [ + { + "id": "books_query", + "request": { + "query": { "match": { "text": "thou" } } + }, + "ratings": [ + { "_index": "shakespeare", "_id": "80", "rating": 0 }, + { "_index": "shakespeare", "_id": "115", "rating": 1 }, + { "_index": "shakespeare", "_id": "117", "rating": 2 } + ] + }, + { + "id": "words_query", + "request": { + "query": { "match": { "text": "art" } } + }, + "ratings": [ + { "_index": "shakespeare", "_id": "115", "rating": 2 } + ] + } + ] +} +```` +{% include copy-curl.html %} + +#### Example response + +````json +{ + "rank_eval": { + "metric_score": 0.7, + "details": { + "query_1": { + "metric_score": 0.9, + "unrated_docs": [ + { + "_index": "shakespeare", + "_id": "1234567" + }, ... + ], + "hits": [ + { + "hit": { + "_index": "shakespeare", + "_type": "page", + "_id": "1234567", + "_score": 5.123456789 + }, + "rating": 1 + }, ... + ], + "metric_details": { + "precision": { + "relevant_docs_retrieved": 3, + "docs_retrieved": 6 + } + } + }, + "query_2": { [... ] } + }, + "failures": { [... ] } + } +} +```` \ No newline at end of file diff --git a/_opensearch/rest-api/remote-info.md b/_api-reference/remote-info.md similarity index 87% rename from _opensearch/rest-api/remote-info.md rename to _api-reference/remote-info.md index e9d88402..ac2971f2 100644 --- a/_opensearch/rest-api/remote-info.md +++ b/_api-reference/remote-info.md @@ -1,12 +1,13 @@ --- layout: default title: Remote cluster information -parent: REST API reference -nav_order: 25 +nav_order: 67 +redirect_from: + - /opensearch/rest-api/remote-info/ --- # Remote cluster information -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } This operation provides connection information for any remote OpenSearch clusters that you've configured for the local cluster, such as the remote cluster alias, connection mode (`sniff` or `proxy`), IP addresses for seed nodes, and timeout settings. @@ -19,7 +20,7 @@ The response is more comprehensive and useful than a call to `_cluster/settings` ``` GET _remote/info ``` - +{% include copy-curl.html %} ## Response diff --git a/_api-reference/script-apis/create-stored-script.md b/_api-reference/script-apis/create-stored-script.md new file mode 100644 index 00000000..04a73a20 --- /dev/null +++ b/_api-reference/script-apis/create-stored-script.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Create or Update Stored Script +parent: Script APIs +nav_order: 1 +--- + +# Create or update stored script +**Introduced 1.0** +{: .label .label-purple } + +Creates or updates a stored script or search template. + +For additional information about Painless scripting, see: + +* [k-NN Painless Scripting extensions]({{site.url}}{{site.baseurl}}/search-plugins/knn/painless-functions/). + +* [k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/). 
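+
+A stored script is created or updated by sending a `PUT` request to the `_scripts` endpoint with the script ID in the path. The following is a minimal sketch of the request line, using the `script-id` path parameter described in the next section:
+
+```
+PUT _scripts/<script-id>
+```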
+ + +## Path parameters + +| Parameter | Data type | Description | +:--- | :--- | :--- +| script-id | String | Stored script or search template ID. Must be unique across the cluster. Required. | + +## Query parameters + +All parameters are optional. + +| Parameter | Data type | Description | +:--- | :--- | :--- +| context | String | Context in which the script or search template is to run. To prevent errors, the API immediately compiles the script or template in this context. | +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager. Defaults to 30 seconds. | +| timeout | Time | The period of time to wait for a response. If a response is not received before the timeout value, the request fails and returns an error. Defaults to 30 seconds.| + +## Request fields + +| Field | Data type | Description | +:--- | :--- | :--- +| script | Object | Defines the script or search template, its parameters, and its language. See *Script object* below. | + +*Script object* + +| Field | Data type | Description | +:--- | :--- | :--- +| lang | String | Scripting language. Required. | +| source | String or Object | Required.

For scripts, a string with the contents of the script.

For search templates, an object that defines the search template. Supports the same parameters as the [Search]({{site.url}}{{site.baseurl}}/api-reference/search) API request body. Search templates also support Mustache variables. | + +#### Example request + +The sample uses an index called `books` with the following documents: + +````json +{"index":{"_id":1}} +{"name":"book1","author":"Faustine","ratings":[4,3,5]} +{"index":{"_id":2}} +{"name":"book2","author":"Amit","ratings":[5,5,5]} +{"index":{"_id":3}} +{"name":"book3","author":"Gilroy","ratings":[2,1,5]} +```` + +The following request creates the Painless script `my-first-script`. It sums the ratings for each book and displays the sum in the output. + +````json +PUT _scripts/my-first-script +{ + "script": { + "lang": "painless", + "source": """ + int total = 0; + for (int i = 0; i < doc['ratings'].length; ++i) { + total += doc['ratings'][i]; + } + return total; + """ + } +} +```` +{% include copy.html %} + +The example above uses the syntax of the Dev Tools console in OpenSearch Dashboards. You can also use a curl request. +{: .note } + +The following curl request is equivalent to the previous Dashboards console example: + +````json +curl -XPUT "http://opensearch:9200/_scripts/my-first-script" -H 'Content-Type: application/json' -d' +{ + "script": { + "lang": "painless", + "source": "\n int total = 0;\n for (int i = 0; i < doc['\''ratings'\''].length; ++i) {\n total += doc['\''ratings'\''][i];\n }\n return total;\n " + } +}' +```` +{% include copy.html %} + + +The following request creates the Painless script `my-first-script`, which sums the ratings for each book and displays the sum in the output: + +````json +PUT _scripts/my-first-script +{ + "script": { + "lang": "painless", + "source": """ + int total = 0; + for (int i = 0; i < doc['ratings'].length; ++i) { + total += doc['ratings'][i]; + } + return total; + """ + } +} +```` +{% include copy-curl.html %} + +See [Execute Painless stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/exec-stored-script/) for information about running the script. + +#### Example response + +The `PUT _scripts/my-first-script` request returns the following field: + +````json +{ + "acknowledged" : true +} +```` + +To determine whether the script was successfully created, use the [Get stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/get-stored-script/) API, passing the script name as the `script` path parameter. +{: .note} + +### Response fields + +| Field | Data type | Description | +:--- | :--- | :--- +| acknowledged | Boolean | Whether the request was received. | + +## Creating or updating a stored script with parameters + +The Painless script supports `params` to pass variables to the script. + +#### Example + +The following request creates the Painless script `multiplier-script`. 
The request sums the ratings for each book, multiplies the summed value by the `multiplier` parameter, and displays the result in the output: + +````json +PUT _scripts/multiplier-script +{ + "script": { + "lang": "painless", + "source": """ + int total = 0; + for (int i = 0; i < doc['ratings'].length; ++i) { + total += doc['ratings'][i]; + } + return total * params['multiplier']; + """ + } +} +```` +{% include copy-curl.html %} + +### Example response + +The `PUT _scripts/multiplier-script` request returns the following field: + +````json +{ + "acknowledged" : true +} +```` \ No newline at end of file diff --git a/_api-reference/script-apis/delete-script.md b/_api-reference/script-apis/delete-script.md new file mode 100644 index 00000000..363b0152 --- /dev/null +++ b/_api-reference/script-apis/delete-script.md @@ -0,0 +1,56 @@ +--- +layout: default +title: Delete Script +parent: Script APIs +nav_order: 4 +--- + +# Delete script +**Introduced 1.0** +{: .label .label-purple } + +Deletes a stored script + +## Path parameters + +Path parameters are optional. + +| Parameter | Data type | Description | +:--- | :--- | :--- +| script-id | String | ID of script to delete. | + +## Query parameters + +| Parameter | Data type | Description | +:--- | :--- | :--- +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager. Optional, defaults to `30s`. | +| timeout | Time | The period of time to wait for a response. If a response is not received before the timeout value, the request will be dropped. + +#### Example request + +The following request deletes the `my-first-script` script: + +````json +DELETE _scripts/my-script +```` +{% include copy-curl.html %} + +#### Example response + +The `DELETE _scripts/my-first-script` request returns the following field: + +````json +{ + "acknowledged" : true +} +```` + +To determine whether the stored script was successfully deleted, use the [Get stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/get-stored-script/) API, passing the script name as the `script` path parameter. + +## Response fields + +The request returns the following response fields: + +| Field | Data type | Description | +:--- | :--- | :--- +| acknowledged | Boolean | Whether the delete script request was received. | \ No newline at end of file diff --git a/_api-reference/script-apis/exec-script.md b/_api-reference/script-apis/exec-script.md new file mode 100644 index 00000000..4ecb6a37 --- /dev/null +++ b/_api-reference/script-apis/exec-script.md @@ -0,0 +1,189 @@ +--- +layout: default +title: Execute Painless script +parent: Script APIs +nav_order: 7 +--- + +# Execute Painless script +**Introduced 1.0** +{: .label .label-purple } + +The Execute Painless script API allows you to run a script that is not stored. + +## Path and HTTP methods + +```json +GET /_scripts/painless/_execute +POST /_scripts/painless/_execute +``` + +## Request fields + +| Field | Description | +:--- | :--- +| script | The script to run. Required| +| context | A context for the script. Optional. Default is `painless_test`. | +| context_setup | Specifies additional parameters for the context. 
Optional.| + +#### Example request + +The following request uses the default `painless_context` for the script: + +```json +GET /_scripts/painless/_execute +{ + "script": { + "source": "(params.x + params.y)/ 2", + "params": { + "x": 80, + "y": 100 + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +The response contains the average of two script parameters: + +```json +{ + "result" : "90" +} +``` + +## Response fields + +| Field | Description | +:--- | :--- +| result | The script result.| + + +## Script contexts + +Choose different contexts to control the variables that are available to the script and the result's return type. The default context is `painless_test`. + +## Painless test context + +The `painless_test` context is the default script context that provides only the `params` variable to the script. The returned result is always converted to a string. See the preceding example request for a usage example. + +## Filter context + +The `filter` context runs the script as if the script were inside a script query. You must provide a test document in the context. The `_source`, stored fields, and `_doc` variables will be available to the script. + +You can specify the following parameters for the filter context in the `context_setup`. + +Parameter | Description +:--- | :--- +document | The document that is indexed in memory temporarily and available to the script. +index | The name of the index that contains a mapping for the document. + +For example, first create an index with a mapping for a test document: + +```json +PUT /testindex1 +{ + "mappings": { + "properties": { + "grad": { + "type": "boolean" + }, + "gpa": { + "type": "float" + } + } + } +} +``` +{% include copy-curl.html %} + +Run a script to determine if a student is eligible to graduate with honors: + +```json +POST /_scripts/painless/_execute +{ + "script": { + "source": "doc['grad'].value == true && doc['gpa'].value >= params.min_honors_gpa", + "params": { + "min_honors_gpa": 3.5 + } + }, + "context": "filter", + "context_setup": { + "index": "testindex1", + "document": { + "grad": true, + "gpa": 3.79 + } + } +} +``` +{% include copy-curl.html %} + +The response contains the result: + +```json +{ + "result" : true +} +``` + +## Score context + +The `score` context runs a script as if the script were in a `script_score` function in a `function_score` query. + +You can specify the following parameters for the score context in the `context_setup`. + +Parameter | Description +:--- | :--- +document | The document that is indexed in memory temporarily and available to the script. +index | The name of the index that contains a mapping for the document. +query | If the script uses the `_score` parameter, the query can specify to use the `_score` field to compute the score. 
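
The `query` setting is useful only when the script reads `_score`. The following is a minimal sketch (not part of the original example set) that supplies a `match_all` query in `context_setup` so that `_score` has a value. It assumes that `_score` is available to the script in this context and reuses the `testindex1` index and `gpa_4_0` field created in the example that follows:

```json
POST /_scripts/painless/_execute
{
  "script": {
    "source": "_score * doc['gpa_4_0'].value"
  },
  "context": "score",
  "context_setup": {
    "index": "testindex1",
    "document": {
      "gpa_4_0": 3.5
    },
    "query": {
      "match_all": {}
    }
  }
}
```
{% include copy-curl.html %}

Because `match_all` assigns a score of `1.0` to every document, the result in this sketch should simply equal the field value.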
+ +For example, first create an index with a mapping for a test document: + +```json +PUT /testindex1 +{ + "mappings": { + "properties": { + "gpa_4_0": { + "type": "float" + } + } + } +} +``` +{% include copy-curl.html %} + +Run a script that converts a GPA on a 4.0 scale into a different scale that is provided as a parameter: + +```json +POST /_scripts/painless/_execute +{ + "script": { + "source": "doc['gpa_4_0'].value * params.max_gpa / 4.0", + "params": { + "max_gpa": 5.0 + } + }, + "context": "score", + "context_setup": { + "index": "testindex1", + "document": { + "gpa_4_0": 3.5 + } + } +} +``` +{% include copy-curl.html %} + +The response contains the result: + +```json +{ + "result" : 4.375 +} +``` \ No newline at end of file diff --git a/_api-reference/script-apis/exec-stored-script.md b/_api-reference/script-apis/exec-stored-script.md new file mode 100644 index 00000000..7525ec81 --- /dev/null +++ b/_api-reference/script-apis/exec-stored-script.md @@ -0,0 +1,326 @@ +--- +layout: default +title: Execute Painless stored script +parent: Script APIs +nav_order: 2 +--- + +# Execute Painless stored script +**Introduced 1.0** +{: .label .label-purple } + +Runs a stored script written in the Painless language. + +OpenSearch provides several ways to run a script; the following sections show how to run a script by passing script information in the request body of a `GET /_search` request. + +## Request fields + +| Field | Data type | Description | +:--- | :--- | :--- +| query | Object | A filter that specifies documents to process. | +| script_fields | Object | Fields to include in output. | +| script | Object | ID of the script that produces a value for a field. | + +#### Example request + +The following request runs the stored script that was created in [Create or update stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/create-stored-script/). The script sums the ratings for each book and displays the sum in the `total_ratings` field in the output. + +* The script's target is the `books` index. + +* The `"match_all": {}` property value is an empty object indicating to process each document in the index. + +* The `total_ratings` field value is the result of the `my-first-script` execution. See [Create or update stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/create-stored-script/). + +````json +GET books/_search +{ + "query": { + "match_all": {} + }, + "script_fields": { + "total_ratings": { + "script": { + "id": "my-first-script" + } + } + } +} +```` +{% include copy-curl.html %} + +#### Example response + +The `GET books/_search` request returns the following fields: + +````json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "books", + "_id" : "1", + "_score" : 1.0, + "fields" : { + "total_ratings" : [ + 12 + ] + } + }, + { + "_index" : "books", + "_id" : "2", + "_score" : 1.0, + "fields" : { + "total_ratings" : [ + 15 + ] + } + }, + { + "_index" : "books", + "_id" : "3", + "_score" : 1.0, + "fields" : { + "total_ratings" : [ + 8 + ] + } + } + ] + } +} +```` + +## Response fields + +| Field | Data type | Description | +:--- | :--- | :--- +| took | Integer | How long the operation took in milliseconds. | +| timed_out | Boolean | Whether the operation timed out. 
| +| _shards | Object | Total number of shards processed and also the total number of successful, skipped, and not processed. | +| hits | Object | Contains high-level information about the documents processed and an array of `hits` objects. See [Hits object](#hits-object). | + +#### Hits object + +| Field | Data type | Description | +:--- | :--- | :--- +| total | Object | Total number of documents processed and their relationship to the `match` request field. | +| max_score | Double | Highest relevance score returned from all the hits. | +| hits | Array | Information about each document that was processed. See [Document object](#Document-object). | + +#### Document object + +| Field | Data type | Description | +:--- | :--- | :--- +| _index | String | Index that contains the document. | +| _id | String | Document ID. | +| _score | Float | Document's relevance score. | +| fields | Object | Fields and their value returned from the script. | + +## Running a Painless stored script with parameters + +To pass different parameters to the script each time when running a query, define `params` in `script_fields`. + +#### Example + +The following request runs the stored script that was created in [Create or update stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/create-stored-script/). The script sums the ratings for each book, multiplies the summed value by the `multiplier` parameter, and displays the result in the output. + +* The script's target is the `books` index. + +* The `"match_all": {}` property value is an empty object, indicating that it processes each document in the index. + +* The `total_ratings` field value is the result of the `multiplier-script` execution. See [Creating or updating a stored script with parameters]({{site.url}}{{site.baseurl}}/api-reference/script-apis/create-stored-script/). 
+ +* `"multiplier": 2` in the `params` field is a variable passed to the stored script `multiplier-script`: + +```json +GET books/_search +{ + "query": { + "match_all": {} + }, + "script_fields": { + "total_ratings": { + "script": { + "id": "multiplier-script", + "params": { + "multiplier": 2 + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took" : 12, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "books", + "_type" : "_doc", + "_id" : "3", + "_score" : 1.0, + "fields" : { + "total_ratings" : [ + 16 + ] + } + }, + { + "_index" : "books", + "_type" : "_doc", + "_id" : "2", + "_score" : 1.0, + "fields" : { + "total_ratings" : [ + 30 + ] + } + }, + { + "_index" : "books", + "_type" : "_doc", + "_id" : "1", + "_score" : 1.0, + "fields" : { + "total_ratings" : [ + 24 + ] + } + } + ] + } +} +``` + +**Sort results using painless stored script +You can use painless stored script to sort results.** + +#### Sample request + +```json +GET books/_search +{ + "query": { + "match_all": {} + }, + "script_fields": { + "total_ratings": { + "script": { + "id": "multiplier-script", + "params": { + "multiplier": 2 + } + } + } + }, + "sort": { + "_script": { + "type": "number", + "script": { + "id": "multiplier-script", + "params": { + "multiplier": 2 + } + }, + "order": "desc" + } + } +} +``` + +#### Sample response + +```json +{ + "took" : 90, + "timed_out" : false, + "_shards" : { + "total" : 5, + "successful" : 5, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ + { + "_index" : "books", + "_type" : "_doc", + "_id" : "2", + "_score" : null, + "fields" : { + "total_ratings" : [ + 30 + ] + }, + "sort" : [ + 30.0 + ] + }, + { + "_index" : "books", + "_type" : "_doc", + "_id" : "1", + "_score" : null, + "fields" : { + "total_ratings" : [ + 24 + ] + }, + "sort" : [ + 24.0 + ] + }, + { + "_index" : "books", + "_type" : "_doc", + "_id" : "3", + "_score" : null, + "fields" : { + "total_ratings" : [ + 16 + ] + }, + "sort" : [ + 16.0 + ] + } + ] + } +} +``` diff --git a/_api-reference/script-apis/get-script-contexts.md b/_api-reference/script-apis/get-script-contexts.md new file mode 100644 index 00000000..40a15595 --- /dev/null +++ b/_api-reference/script-apis/get-script-contexts.md @@ -0,0 +1,578 @@ +--- +layout: default +title: Get Stored Script Contexts +parent: Script APIs +nav_order: 5 +--- + +# Get stored script contexts +**Introduced 1.0** +{: .label .label-purple } + +Retrieves all contexts for stored scripts. 
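
The full response (shown in the example below) lists every method signature for every context, so it can be long. If you only need the context names, one option is the common `filter_path` response-filtering query parameter, which is not specific to this API; a minimal sketch:

````json
GET _script_context?filter_path=contexts.name
````
{% include copy-curl.html %}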
+ +#### Example request + +````json +GET _script_context +```` +{% include copy-curl.html %} + +#### Example response + +The `GET _script_context` request returns the following fields: + +````json +{ + "contexts" : [ + { + "name" : "aggregation_selector", + "methods" : [ + { + "name" : "execute", + "return_type" : "boolean", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "aggs", + "methods" : [ + { + "name" : "execute", + "return_type" : "java.lang.Object", + "params" : [ ] + }, + { + "name" : "getDoc", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "get_score", + "return_type" : "java.lang.Number", + "params" : [ ] + }, + { + "name" : "get_value", + "return_type" : "java.lang.Object", + "params" : [ ] + } + ] + }, + { + "name" : "aggs_combine", + "methods" : [ + { + "name" : "execute", + "return_type" : "java.lang.Object", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getState", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "aggs_init", + "methods" : [ + { + "name" : "execute", + "return_type" : "void", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getState", + "return_type" : "java.lang.Object", + "params" : [ ] + } + ] + }, + { + "name" : "aggs_map", + "methods" : [ + { + "name" : "execute", + "return_type" : "void", + "params" : [ ] + }, + { + "name" : "getDoc", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getState", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "get_score", + "return_type" : "double", + "params" : [ ] + } + ] + }, + { + "name" : "aggs_reduce", + "methods" : [ + { + "name" : "execute", + "return_type" : "java.lang.Object", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getStates", + "return_type" : "java.util.List", + "params" : [ ] + } + ] + }, + { + "name" : "analysis", + "methods" : [ + { + "name" : "execute", + "return_type" : "boolean", + "params" : [ + { + "type" : "org.opensearch.analysis.common.AnalysisPredicateScript$Token", + "name" : "token" + } + ] + } + ] + }, + { + "name" : "bucket_aggregation", + "methods" : [ + { + "name" : "execute", + "return_type" : "java.lang.Number", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "field", + "methods" : [ + { + "name" : "execute", + "return_type" : "java.lang.Object", + "params" : [ ] + }, + { + "name" : "getDoc", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "filter", + "methods" : [ + { + "name" : "execute", + "return_type" : "boolean", + "params" : [ ] + }, + { + "name" : "getDoc", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "ingest", + "methods" : [ + { + "name" : "execute", + "return_type" : "void", + "params" : [ + { + "type" : "java.util.Map", + "name" : "ctx" + } + ] + }, + { + "name" : "getParams", + 
"return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "interval", + "methods" : [ + { + "name" : "execute", + "return_type" : "boolean", + "params" : [ + { + "type" : "org.opensearch.index.query.IntervalFilterScript$Interval", + "name" : "interval" + } + ] + } + ] + }, + { + "name" : "moving-function", + "methods" : [ + { + "name" : "execute", + "return_type" : "double", + "params" : [ + { + "type" : "java.util.Map", + "name" : "params" + }, + { + "type" : "double[]", + "name" : "values" + } + ] + } + ] + }, + { + "name" : "number_sort", + "methods" : [ + { + "name" : "execute", + "return_type" : "double", + "params" : [ ] + }, + { + "name" : "getDoc", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "get_score", + "return_type" : "double", + "params" : [ ] + } + ] + }, + { + "name" : "painless_test", + "methods" : [ + { + "name" : "execute", + "return_type" : "java.lang.Object", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "processor_conditional", + "methods" : [ + { + "name" : "execute", + "return_type" : "boolean", + "params" : [ + { + "type" : "java.util.Map", + "name" : "ctx" + } + ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "score", + "methods" : [ + { + "name" : "execute", + "return_type" : "double", + "params" : [ + { + "type" : "org.opensearch.script.ScoreScript$ExplanationHolder", + "name" : "explanation" + } + ] + }, + { + "name" : "getDoc", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "get_score", + "return_type" : "double", + "params" : [ ] + } + ] + }, + { + "name" : "script_heuristic", + "methods" : [ + { + "name" : "execute", + "return_type" : "double", + "params" : [ + { + "type" : "java.util.Map", + "name" : "params" + } + ] + } + ] + }, + { + "name" : "similarity", + "methods" : [ + { + "name" : "execute", + "return_type" : "double", + "params" : [ + { + "type" : "double", + "name" : "weight" + }, + { + "type" : "org.opensearch.index.similarity.ScriptedSimilarity$Query", + "name" : "query" + }, + { + "type" : "org.opensearch.index.similarity.ScriptedSimilarity$Field", + "name" : "field" + }, + { + "type" : "org.opensearch.index.similarity.ScriptedSimilarity$Term", + "name" : "term" + }, + { + "type" : "org.opensearch.index.similarity.ScriptedSimilarity$Doc", + "name" : "doc" + } + ] + } + ] + }, + { + "name" : "similarity_weight", + "methods" : [ + { + "name" : "execute", + "return_type" : "double", + "params" : [ + { + "type" : "org.opensearch.index.similarity.ScriptedSimilarity$Query", + "name" : "query" + }, + { + "type" : "org.opensearch.index.similarity.ScriptedSimilarity$Field", + "name" : "field" + }, + { + "type" : "org.opensearch.index.similarity.ScriptedSimilarity$Term", + "name" : "term" + } + ] + } + ] + }, + { + "name" : "string_sort", + "methods" : [ + { + "name" : "execute", + "return_type" : "java.lang.String", + "params" : [ ] + }, + { + "name" : "getDoc", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "get_score", + "return_type" : "double", + "params" : [ ] + } + ] + }, + { + "name" : "template", + "methods" : [ + { + "name" : "execute", + 
"return_type" : "java.lang.String", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "terms_set", + "methods" : [ + { + "name" : "execute", + "return_type" : "java.lang.Number", + "params" : [ ] + }, + { + "name" : "getDoc", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "trigger", + "methods" : [ + { + "name" : "execute", + "return_type" : "boolean", + "params" : [ + { + "type" : "org.opensearch.alerting.script.QueryLevelTriggerExecutionContext", + "name" : "ctx" + } + ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + }, + { + "name" : "update", + "methods" : [ + { + "name" : "execute", + "return_type" : "void", + "params" : [ ] + }, + { + "name" : "getCtx", + "return_type" : "java.util.Map", + "params" : [ ] + }, + { + "name" : "getParams", + "return_type" : "java.util.Map", + "params" : [ ] + } + ] + } + ] +} +```` + +## Response fields + +The `GET _script_context` request returns the following response fields: + +| Field | Data type | Description | +:--- | :--- | :--- +| contexts | List | A list of all contexts. See [Script object](#script-context). | + +#### Script context + +| Field | Data type | Description | +:--- | :--- | :--- +| name | String | The context name. | +| methods | List | List of the context's allowable methods. See [Script object](#context-methods). | + +#### Context methods + +| Field | Data type | Description | +:--- | :--- | :--- +| name | String | Method name. | +| name | String | Type that the method returns (`boolean`, `object`, `number`, and so on). | +| params | List | List of the parameters accepted by the method. See [Script object](#method-parameters). | + +#### Method parameters + +| Field | Data type | Description | +:--- | :--- | :--- +| type | String | Parameter data type. | +| name | String | Parameter name. | \ No newline at end of file diff --git a/_api-reference/script-apis/get-script-language.md b/_api-reference/script-apis/get-script-language.md new file mode 100644 index 00000000..d53eb048 --- /dev/null +++ b/_api-reference/script-apis/get-script-language.md @@ -0,0 +1,101 @@ +--- +layout: default +title: Get Script Language +parent: Script APIs +nav_order: 6 +--- + +# Get script language +**Introduced 1.0** +{: .label .label-purple } + +The get script language API operation retrieves all supported script languages and their contexts. 
+ +#### Example request + +```json +GET _script_language +``` +{% include copy-curl.html %} + +#### Example response + +The `GET _script_language` request returns the available contexts for each language: + +```json +{ + "types_allowed" : [ + "inline", + "stored" + ], + "language_contexts" : [ + { + "language" : "expression", + "contexts" : [ + "aggregation_selector", + "aggs", + "bucket_aggregation", + "field", + "filter", + "number_sort", + "score", + "terms_set" + ] + }, + { + "language" : "mustache", + "contexts" : [ + "template" + ] + }, + { + "language" : "opensearch_query_expression", + "contexts" : [ + "aggs", + "filter" + ] + }, + { + "language" : "painless", + "contexts" : [ + "aggregation_selector", + "aggs", + "aggs_combine", + "aggs_init", + "aggs_map", + "aggs_reduce", + "analysis", + "bucket_aggregation", + "field", + "filter", + "ingest", + "interval", + "moving-function", + "number_sort", + "painless_test", + "processor_conditional", + "score", + "script_heuristic", + "similarity", + "similarity_weight", + "string_sort", + "template", + "terms_set", + "trigger", + "update" + ] + } + ] +} +``` + +## Response fields + +The request contains the following response fields. + +Field | Data type | Description | +:--- | :--- | :--- +types_allowed | List of strings | The types of scripts that are enabled, determined by the `script.allowed_types` setting. May contain `inline` and/or `stored`. +language_contexts | List of objects | A list of objects, each of which maps a supported language to its available contexts. +language_contexts.language | String | The name of the registered scripting language. +language_contexts.contexts | List of strings | A list of all contexts for the language, determined by the `script.allowed_contexts` setting. diff --git a/_api-reference/script-apis/get-stored-script.md b/_api-reference/script-apis/get-stored-script.md new file mode 100644 index 00000000..cc681cd0 --- /dev/null +++ b/_api-reference/script-apis/get-stored-script.md @@ -0,0 +1,71 @@ +--- +layout: default +title: Get Stored Script +parent: Script APIs +nav_order: 3 +--- + +# Get stored script +**Introduced 1.0** +{: .label .label-purple } + +Retrieves a stored script. + +## Path parameters + +| Parameter | Data type | Description | +:--- | :--- | :--- +| script | String | Stored script or search template name. Required.| + +## Query parameters + +| Parameter | Data type | Description | +:--- | :--- | :--- +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager. Optional, defaults to `30s`. | + +#### Example request + +The following retrieves the `my-first-script` stored script. + +````json +GET _scripts/my-first-script +```` +{% include copy-curl.html %} + +#### Example response + +The `GET _scripts/my-first-script` request returns the following fields: + +````json +{ + "_id" : "my-first-script", + "found" : true, + "script" : { + "lang" : "painless", + "source" : """ + int total = 0; + for (int i = 0; i < doc['ratings'].length; ++i) { + total += doc['ratings'][i]; + } + return total; + """ + } +} +```` + +## Response fields + +The `GET _scripts/my-first-script` request returns the following response fields: + +| Field | Data type | Description | +:--- | :--- | :--- +| _id | String | The script's name. | +| found | Boolean | The requested script exists and was retrieved. | +| script | Object | The script definition. See [Script object](#script-object). 
| + +#### Script object + +| Field | Data type | Description | +:--- | :--- | :--- +| lang | String | The script's language. | +| source | String | The script's body. | \ No newline at end of file diff --git a/_api-reference/script-apis/index.md b/_api-reference/script-apis/index.md new file mode 100644 index 00000000..605faf33 --- /dev/null +++ b/_api-reference/script-apis/index.md @@ -0,0 +1,14 @@ +--- +layout: default +title: Script APIs +has_children: true +nav_order: 70 +redirect_from: + - /opensearch/rest-api/script-apis/ +--- + +# Script APIs +**Introduced 1.0** +{: .label .label-purple } + +The script APIs allow you to work with stored scripts. Stored scripts are part of the cluster state and reduce compilation time and enhance search speed. The default scripting language is Painless. diff --git a/_opensearch/rest-api/scroll.md b/_api-reference/scroll.md similarity index 93% rename from _opensearch/rest-api/scroll.md rename to _api-reference/scroll.md index 42317200..cee59990 100644 --- a/_opensearch/rest-api/scroll.md +++ b/_api-reference/scroll.md @@ -1,12 +1,13 @@ --- layout: default title: Scroll -parent: REST API reference -nav_order: 120 +nav_order: 71 +redirect_from: + - /opensearch/rest-api/scroll/ --- # Scroll -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } You can use the `scroll` operation to retrieve a large number of results. For example, for machine learning jobs, you can request an unlimited number of results in batches. @@ -26,6 +27,7 @@ GET shakespeare/_search?scroll=10m "size": 10000 } ``` +{% include copy-curl.html %} OpenSearch caches the results and returns a scroll ID to access them in batches: @@ -42,6 +44,7 @@ GET _search/scroll "scroll_id": "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAUWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ==" } ``` +{% include copy-curl.html %} Using this scroll ID, you get results in batches of 10,000 as long as the search context is still open. Typically, the scroll ID does not change between requests, but it *can* change, so make sure to always use the latest scroll ID. If you don't send the next scroll request within the set search context, the `scroll` operation does not return any results. @@ -60,6 +63,7 @@ GET shakespeare/_search?scroll=10m } } ``` +{% include copy-curl.html %} With a single scroll ID, you get back 10 results. You can have up to 10 IDs. @@ -69,12 +73,14 @@ Close the search context when you’re done scrolling, because the `scroll` oper ```json DELETE _search/scroll/DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAcWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ== ``` +{% include copy-curl.html %} To close all open scroll contexts: ```json DELETE _search/scroll/_all ``` +{% include copy-curl.html %} The `scroll` operation corresponds to a specific timestamp. It doesn't consider documents added after that timestamp as potential results. 
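
If you open several scroll contexts, for example when using sliced scrolls, you can also close a specific set of them in one request by passing the scroll IDs in the request body. The following is a sketch that reuses the sample IDs from the examples above; if your deployment does not accept a request body for this endpoint, use the path form or `DELETE _search/scroll/_all` shown previously:

```json
DELETE _search/scroll
{
  "scroll_id": [
    "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAUWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ==",
    "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAcWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ=="
  ]
}
```
{% include copy-curl.html %}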
diff --git a/_opensearch/search-template.md b/_api-reference/search-template.md similarity index 96% rename from _opensearch/search-template.md rename to _api-reference/search-template.md index 476e8049..3dcaf3f5 100644 --- a/_opensearch/search-template.md +++ b/_api-reference/search-template.md @@ -1,7 +1,10 @@ --- layout: default title: Search templates -nav_order: 50 +nav_order: 80 +redirect_from: + - /opensearch/search-template/ + - /search-plugins/search-template/ --- # Search templates @@ -205,6 +208,15 @@ POST _render/template } ``` +The following render operations are supported: + +```json +GET /_render/template +POST /_render/template +GET /_render/template/ +POST /_render/template/ +``` + ## Advanced parameter conversion with search templates You have a lot of different syntax options in Mustache to transpose the input parameters into a query. @@ -253,7 +265,7 @@ GET _search/template ### Loops -You can also use the section tag to implement a foreach loop: +You can also use the section tag to implement a for each loop: ``` {% raw %}{{#var}}{{.}}}{{/var}}{% endraw %} diff --git a/_api-reference/search.md b/_api-reference/search.md new file mode 100644 index 00000000..71d96bca --- /dev/null +++ b/_api-reference/search.md @@ -0,0 +1,225 @@ +--- +layout: default +title: Search +nav_order: 75 +redirect_from: + - /opensearch/rest-api/search/ +--- + +# Search +**Introduced 1.0** +{: .label .label-purple } + +The Search API operation lets you execute a search request to search your cluster for data. + +## Example + +```json +GET /movies/_search +{ + "query": { + "match": { + "text_entry": "I am the night" + } + } +} +``` +{% include copy-curl.html %} + +## Path and HTTP Methods + +``` +GET //_search +GET /_search + +POST //_search +POST /_search +``` + +## URL Parameters + +All URL parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is true. +allow_partial_search_results | Boolean | Whether to return partial results if the request runs into an error or times out. Default is true. +analyzer | String | Analyzer to use in the query string. +analyze_wildcard | Boolean | Whether the update operation should include wildcard and prefix queries in the analysis. Default is false. +batched_reduce_size | Integer | How many shard results to reduce on a node. Default is 512. +cancel_after_time_interval | Time | The time after which the search request will be canceled. Request-level parameter takes precedence over cancel_after_time_interval [cluster setting]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings). Default is -1. +ccs_minimize_roundtrips | Boolean | Whether to minimize roundtrips between a node and remote clusters. Default is true. +default_operator | String | Indicates whether the default operator for a string query should be AND or OR. Default is OR. +df | String | The default field in case a field prefix is not provided in the query string. +docvalue_fields | String | The fields that OpenSearch should return using their docvalue forms. +expand_wildcards | String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values. Valid values are all (match any index), open (match open, non-hidden indexes), closed (match closed, non-hidden indexes), hidden (match hidden indexes), and none (deny wildcard expressions). Default is open. 
+explain | Boolean | Whether to return details about how OpenSearch computed the document's score. Default is false. +from | Integer | The starting index to search from. Default is 0. +ignore_throttled | Boolean | Whether to ignore concrete, expanded, or indexes with aliases if indexes are frozen. Default is true. +ignore_unavailable | Boolean | Specifies whether to include missing or closed indexes in the response. Default is false. +lenient | Boolean | Specifies whether OpenSearch should accept requests if queries have format errors (for example, querying a text field for an integer). Default is false. +max_concurrent_shard_requests | Integer | How many concurrent shard requests this request should execute on each node. Default is 5. +phase_took | Boolean | Whether to return phase-level `took` time values in the response. Default is false. +pre_filter_shard_size | Integer | A prefilter size threshold that triggers a prefilter operation if the request exceeds the threshold. Default is 128 shards. +preference | String | Specifies the shards or nodes on which OpenSearch should perform the search. For valid values, see [The `preference` query parameter](#the-preference-query-parameter). +q | String | Lucene query string’s query. +request_cache | Boolean | Specifies whether OpenSearch should use the request cache. Default is whether it’s enabled in the index’s settings. +rest_total_hits_as_int | Boolean | Whether to return `hits.total` as an integer. Returns an object otherwise. Default is false. +routing | String | Value used to route the update by query operation to a specific shard. +scroll | Time | How long to keep the search context open. +search_type | String | Whether OpenSearch should use global term and document frequencies when calculating relevance scores. Valid choices are `query_then_fetch` and `dfs_query_then_fetch`. `query_then_fetch` scores documents using local term and document frequencies for the shard. It’s usually faster but less accurate. `dfs_query_then_fetch` scores documents using global term and document frequencies across all shards. It’s usually slower but more accurate. Default is `query_then_fetch`. +seq_no_primary_term | Boolean | Whether to return sequence number and primary term of the last operation of each document hit. +size | Integer | How many results to include in the response. +sort | List | A comma-separated list of <field> : <direction> pairs to sort by. +_source | String | Whether to include the `_source` field in the response. +_source_excludes | List | A comma-separated list of source fields to exclude from the response. +_source_includes | List | A comma-separated list of source fields to include in the response. +stats | String | Value to associate with the request for additional logging. +stored_fields | Boolean | Whether the get operation should retrieve fields stored in the index. Default is false. +suggest_field | String | Fields OpenSearch can use to look for similar terms. +suggest_mode | String | The mode to use when searching. Available options are `always` (use suggestions based on the provided terms), `popular` (use suggestions that have more occurrences), and `missing` (use suggestions for terms not in the index). +suggest_size | Integer | How many suggestions to return. +suggest_text | String | The source that suggestions should be based off of. +terminate_after | Integer | The maximum number of documents OpenSearch should process before terminating the request. Default is 0. 
+timeout | Time | How long the operation should wait for a response from active shards. Default is `1m`. +track_scores | Boolean | Whether to return document scores. Default is false. +track_total_hits | Boolean or Integer | Whether to return how many documents matched the query. +typed_keys | Boolean | Whether returned aggregations and suggested terms should include their types in the response. Default is true. +version | Boolean | Whether to include the document version as a match. +include_named_queries_score | Boolean | Whether to return scores with named queries. Default is false. + +### The `preference` query parameter + +The `preference` query parameter specifies the shards or nodes on which OpenSearch should perform the search. The following are valid values: + +- `_primary`: Perform the search only on primary shards. +- `_replica`: Perform the search only on replica shards. +- `_primary_first`: Perform the search on primary shards but fail over to other available shards if primary shards are not available. +- `_replica_first`: Perform the search on replica shards but fail over to other available shards if replica shards are not available. +- `_local`: If possible, perform the search on the local node's shards. +- `_prefer_nodes:,`: If possible, perform the search on the specified nodes. Use a comma-separated list to specify multiple nodes. +- `_shards:,`: Perform the search only on the specified shards. Use a comma-separated list to specify multiple shards. When combined with other preferences, the `_shards` preference must be listed first. For example, `_shards:1,2|_replica`. +- `_only_nodes:,`: Perform the search only on the specified nodes. Use a comma-separated list to specify multiple nodes. +- ``: Specifies a custom string to use for the search. The string cannot start with an underscore character (`_`). Searches with the same custom string are routed to the same shards. + +## Request body + +All fields are optional. + +Field | Type | Description +:--- | :--- | :--- +aggs | Object | In the optional `aggs` parameter, you can define any number of aggregations. Each aggregation is defined by its name and one of the types of aggregations that OpenSearch supports. For more information, see [Aggregations]({{site.url}}{{site.baseurl}}/aggregations/). +docvalue_fields | Array of objects | The fields that OpenSearch should return using their docvalue forms. Specify a format to return results in a certain format, such as date and time. +fields | Array | The fields to search for in the request. Specify a format to return results in a certain format, such as date and time. +explain | String | Whether to return details about how OpenSearch computed the document's score. Default is false. +from | Integer | The starting index to search from. Default is 0. +indices_boost | Array of objects | Values used to boost the score of specified indexes. Specify in the format of <index> : <boost-multiplier> +min_score | Integer | Specify a score threshold to return only documents above the threshold. +query | Object | The [DSL query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index) to use in the request. +seq_no_primary_term | Boolean | Whether to return sequence number and primary term of the last operation of each document hit. +size | Integer | How many results to return. Default is 10. +_source | | Whether to include the `_source` field in the response. +stats | String | Value to associate with the request for additional logging. 
+terminate_after | Integer | The maximum number of documents OpenSearch should process before terminating the request. Default is 0. +timeout | Time | How long to wait for a response. Default is no timeout. +version | Boolean | Whether to include the document version in the response. + +## Response body + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": "superheroes", + "_id": "1", + "_score": 1.0, + "_source": { + "superheroes": [ + { + "Hero name": "Superman", + "Real identity": "Clark Kent", + "Age": 28 + }, + { + "Hero name": "Batman", + "Real identity": "Bruce Wayne", + "Age": 26 + }, + { + "Hero name": "Flash", + "Real identity": "Barry Allen", + "Age": 28 + }, + { + "Hero name": "Robin", + "Real identity": "Dick Grayson", + "Age": 15 + } + ] + } + } + ] + } +} +``` + +## The `ext` object + +Starting with OpenSearch 2.10, plugin authors can add an `ext` object to the search response. The purpose of the `ext` object is to contain plugin-specific response fields. For example, in conversational search, the result of Retrieval Augmented Generation (RAG) is a single "hit" (answer). Plugin authors can include this answer in the search response as part of the `ext` object so that it is separate from the search hits. In the following example response, the RAG result is in the `ext.retrieval_augmented_generation.answer` field: + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 3, + "successful": 3, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 110, + "relation": "eq" + }, + "max_score": 0.55129033, + "hits": [ + { + "_index": "...", + "_id": "...", + "_score": 0.55129033, + "_source": { + "text": "...", + "title": "..." + } + }, + { + ... + } + ... + { + ... + } + ], + }, // end of hits + "ext": { + "retrieval_augmented_generation": { // a search response processor + "answer": "RAG answer" + } + } +} +``` diff --git a/_api-reference/snapshots/create-repository.md b/_api-reference/snapshots/create-repository.md new file mode 100644 index 00000000..856332b7 --- /dev/null +++ b/_api-reference/snapshots/create-repository.md @@ -0,0 +1,115 @@ +--- +layout: default +title: Register Snapshot Repository +parent: Snapshot APIs +nav_order: 1 +--- + +# Registering or updating a snapshot repository +**Introduced 1.0** +{: .label .label-purple } + +You can register a new repository in which to store snapshots or update information for an existing repository by using the snapshots API. + +There are two types of snapshot repositories: + +* File system (`fs`): For instructions on creating an `fs` repository, see [Register repository shared file system]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore/#shared-file-system). + +* Amazon Simple Storage Service (Amazon S3) bucket (`s3`): For instructions on creating an `s3` repository, see [Register repository Amazon S3]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore/#amazon-s3). + +For instructions on creating a repository, see [Register repository]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#register-repository). 
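
One scenario called out in the settings tables below is migrating snapshots between clusters using the `readonly` setting: the source cluster registers the repository as writable, and the target cluster registers the same storage location as read-only. The following is a hedged sketch of the read-only registration for a shared file system repository (the repository name and mount path are illustrative):

```json
PUT /_snapshot/migration-repo
{
  "type": "fs",
  "settings": {
    "location": "/mnt/snapshots",
    "readonly": true
  }
}
```
{% include copy-curl.html %}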
+ +## Path and HTTP methods + +``` +POST /_snapshot/my-first-repo/ +PUT /_snapshot/my-first-repo/ +``` + +## Path parameters + +Parameter | Data type | Description +:--- | :--- | :--- +repository | String | Repository name | + +## Request parameters + +Request parameters depend on the type of repository: `fs` or `s3`. + +### fs repository + +Request field | Description +:--- | :--- +`location` | The file system directory for snapshots, such as a mounted directory from a file server or a Samba share. Must be accessible by all nodes. Required. +`chunk_size` | Breaks large files into chunks during snapshot operations (e.g. `64mb`, `1gb`), which is important for cloud storage providers and far less important for shared file systems. Default is `null` (unlimited). Optional. +`compress` | Whether to compress metadata files. This setting does not affect data files, which might already be compressed, depending on your index settings. Default is `false`. Optional. +`max_restore_bytes_per_sec` | The maximum rate at which snapshots restore. Default is 40 MB per second (`40m`). Optional. +`max_snapshot_bytes_per_sec` | The maximum rate at which snapshots take. Default is 40 MB per second (`40m`). Optional. +`remote_store_index_shallow_copy` | Boolean | Determines whether the snapshot of the remote store indexes are captured as a shallow copy. Default is `false`. +`readonly` | Whether the repository is read-only. Useful when migrating from one cluster (`"readonly": false` when registering) to another cluster (`"readonly": true` when registering). Optional. + +#### Example request + +The following example registers an `fs` repository using the local directory `/mnt/snapshots` as `location`. + +```json +PUT /_snapshot/my-fs-repository +{ + "type": "fs", + "settings": { + "location": "/mnt/snapshots" + } +} +``` +{% include copy-curl.html %} + +#### s3 repository + +Request field | Description +:--- | :--- +`base_path` | The path within the bucket in which you want to store snapshots (for example, `my/snapshot/directory`). Optional. If not specified, snapshots are stored in the S3 bucket root. +`bucket` | Name of the S3 bucket. Required. +`buffer_size` | The threshold beyond which chunks (of `chunk_size`) should be broken into pieces (of `buffer_size`) and sent to S3 using a different API. Default is the smaller of two values: 100 MB or 5% of the Java heap. Valid values are between `5mb` and `5gb`. We don't recommend changing this option. +`canned_acl` | S3 has several [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl) that the `repository-s3` plugin can add to objects as it creates them in S3. Default is `private`. Optional. +`chunk_size` | Breaks files into chunks during snapshot operations (e.g. `64mb`, `1gb`), which is important for cloud storage providers and far less important for shared file systems. Default is `1gb`. Optional. +`client` | When specifying client settings (e.g. `s3.client.default.access_key`), you can use a string other than `default` (e.g. `s3.client.backup-role.access_key`). If you used an alternate name, change this value to match. Default and recommended value is `default`. Optional. +`compress` | Whether to compress metadata files. This setting does not affect data files, which might already be compressed, depending on your index settings. Default is `false`. Optional. +`disable_chunked_encoding` | Disables chunked encoding for compatibility with some storage services. Default is `false`. Optional. 
+`max_restore_bytes_per_sec` | The maximum rate at which snapshots restore. Default is 40 MB per second (`40m`). Optional. +`max_snapshot_bytes_per_sec` | The maximum rate at which snapshots take. Default is 40 MB per second (`40m`). Optional. +`readonly` | Whether the repository is read-only. Useful when migrating from one cluster (`"readonly": false` when registering) to another cluster (`"readonly": true` when registering). Optional. +`remote_store_index_shallow_copy` | Boolean | Whether the snapshot of the remote store indexes is captured as a shallow copy. Default is `false`. +`server_side_encryption` | Whether to encrypt snapshot files in the S3 bucket. This setting uses AES-256 with S3-managed keys. See [Protecting data using server-side encryption](https://docs.aws.amazon.com/AmazonS3/latest/dev/serv-side-encryption.html). Default is false. Optional. +`storage_class` | Specifies the [S3 storage class](https://docs.aws.amazon.com/AmazonS3/latest/dev/storage-class-intro.html) for the snapshots files. Default is `standard`. Do not use the `glacier` and `deep_archive` storage classes. Optional. + +For the `base_path` parameter, do not enter the `s3://` prefix when entering your S3 bucket details. Only the name of the bucket is required. +{: .note} + +#### Example request + +The following request registers a new S3 repository called `my-opensearch-repo` in an existing bucket called `my-open-search-bucket`. By default, all snapshots are stored in the `my/snapshot/directory`. + +```json +PUT /_snapshot/my-opensearch-repo +{ + "type": "s3", + "settings": { + "bucket": "my-open-search-bucket", + "base_path": "my/snapshot/directory" + } +} +``` +{% include copy-curl.html %} + +#### Example response + +Upon success, the following JSON object is returned: + +```json +{ + "acknowledged": true +} +``` + +To verify that the repository was registered, use the [Get snapshot repository]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot-repository) API, passing the repository name as the `repository` path parameter. +{: .note} \ No newline at end of file diff --git a/_api-reference/snapshots/create-snapshot.md b/_api-reference/snapshots/create-snapshot.md new file mode 100644 index 00000000..4f0a6d05 --- /dev/null +++ b/_api-reference/snapshots/create-snapshot.md @@ -0,0 +1,147 @@ +--- +layout: default +title: Create Snapshot +parent: Snapshot APIs +nav_order: 5 +--- + +# Create snapshot +**Introduced 1.0** +{: .label .label-purple } + +Creates a snapshot within an existing repository. + +* To learn more about snapshots, see [Snapshots]({{site.url}}{{site.baseurl}}/opensearch/snapshots/index). + +* To view a list of your repositories, see [Get snapshot repository]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot-repository). + +## Path and HTTP methods + +```json +PUT /_snapshot// +POST /_snapshot// +``` + +## Path parameters + +Parameter | Data type | Description +:--- | :--- | :--- +repository | String | Repostory name to contain the snapshot. | +snapshot | String | Name of Snapshot to create. | + +## Query parameters + +Parameter | Data type | Description +:--- | :--- | :--- +wait_for_completion | Boolean | Whether to wait for snapshot creation to complete before continuing. If you include this parameter, the snapshot definition is returned after completion. | + +## Request fields + +The request body is optional. + +Field | Data type | Description +:--- | :--- | :--- +`indices` | String | The indices you want to include in the snapshot. 
You can use `,` to create a list of indices, `*` to specify an index pattern, and `-` to exclude certain indices. Don't put spaces between items. Default is all indices. +`ignore_unavailable` | Boolean | If an index from the `indices` list doesn't exist, whether to ignore it rather than fail the snapshot. Default is false. +`include_global_state` | Boolean | Whether to include cluster state in the snapshot. Default is true. +`partial` | Boolean | Whether to allow partial snapshots. Default is false, which fails the entire snapshot if one or more shards fails to stor + +#### Example requests + +##### Request without a body + +The following request creates a snapshot called `my-first-snapshot` in an S3 repository called `my-s3-repository`. A request body is not included because it is optional. + +```json +POST _snapshot/my-s3-repository/my-first-snapshot +``` +{% include copy-curl.html %} + +##### Request with a body + +You can also add a request body to include or exclude certain indices or specify other settings: + +```json +PUT _snapshot/my-s3-repository/2 +{ + "indices": "opensearch-dashboards*,my-index*,-my-index-2016", + "ignore_unavailable": true, + "include_global_state": false, + "partial": false +} +``` +{% include copy-curl.html %} + +#### Example responses + +Upon success, the response content depends on whether you include the `wait_for_completion` query parameter. + +##### `wait_for_completion` not included + +```json +{ + "accepted": true +} +``` + +To verify that the snapshot was created, use the [Get snapshot]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot) API, passing the snapshot name as the `snapshot` path parameter. +{: .note} + +##### `wait_for_completion` included + +The snapshot definition is returned. + +```json +{ + "snapshot" : { + "snapshot" : "5", + "uuid" : "ZRH4Zv7cSnuYev2JpLMJGw", + "version_id" : 136217927, + "version" : "2.0.1", + "indices" : [ + ".opendistro-reports-instances", + ".opensearch-observability", + ".kibana_1", + "opensearch_dashboards_sample_data_flights", + ".opensearch-notifications-config", + ".opendistro-reports-definitions", + "shakespeare" + ], + "data_streams" : [ ], + "include_global_state" : true, + "state" : "SUCCESS", + "start_time" : "2022-08-10T16:52:15.277Z", + "start_time_in_millis" : 1660150335277, + "end_time" : "2022-08-10T16:52:18.699Z", + "end_time_in_millis" : 1660150338699, + "duration_in_millis" : 3422, + "failures" : [ ], + "shards" : { + "total" : 7, + "failed" : 0, + "successful" : 7 + } + } +} +``` + +#### Response fields + +| Field | Data type | Description | +| :--- | :--- | :--- | +| snapshot | string | Snapshot name. | +| uuid | string | Snapshot's universally unique identifier (UUID). | +| version_id | int | Build ID of the Open Search version that created the snapshot. | +| version | float | Open Search version that created the snapshot. | +| indices | array | Indices in the snapshot. | +| data_streams | array | Data streams in the snapshot. | +| include_global_state | boolean | Whether the current cluster state is included in the snapshot. | +| start_time | string | Date/time when the snapshot creation process began. | +| start_time_in_millis | long | Time (in milliseconds) when the snapshot creation process began. | +| end_time | string | Date/time when the snapshot creation process ended. | +| end_time_in_millis | long | Time (in milliseconds) when the snapshot creation process ended. | +| duration_in_millis | long | Total time (in milliseconds) that the snapshot creation process lasted. 
| +| failures | array | Failures, if any, that occured during snapshot creation. | +| shards | object | Total number of shards created along with number of successful and failed shards. | +| state | string | Snapshot status. Possible values: `IN_PROGRESS`, `SUCCESS`, `FAILED`, `PARTIAL`. | +| remote_store_index_shallow_copy | Boolean | Whether the snapshot of the remote store indexes is captured as a shallow copy. Default is `false`. | \ No newline at end of file diff --git a/_api-reference/snapshots/delete-snapshot-repository.md b/_api-reference/snapshots/delete-snapshot-repository.md new file mode 100644 index 00000000..385205a5 --- /dev/null +++ b/_api-reference/snapshots/delete-snapshot-repository.md @@ -0,0 +1,44 @@ +--- +layout: default +title: Delete Snapshot Repository +parent: Snapshot APIs +nav_order: 3 +--- + +# Delete snapshot repository configuration +**Introduced 1.0** +{: .label .label-purple } + + Deletes a snapshot repository configuration. + + A repository in OpenSearch is simply a configuration that maps a repository name to a type (file system or s3 repository) along with other information depending on the type. The configuration is backed by a file system location or an s3 bucket. When you invoke the API, the physical file system or s3 bucket itself is not deleted. Only the configuration is deleted. + + To learn more about repositories, see [Register or update snapshot repository]({{site.url}}{{site.baseurl}}/api-reference/snapshots/create-repository). + +## Path parameters + +Parameter | Data type | Description +:--- | :--- | :--- +repository | String | Repository to delete. | + +#### Example request + +The following request deletes the `my-opensearch-repo` repository: + +````json +DELETE _snapshot/my-opensearch-repo +```` +{% include copy-curl.html %} + +#### Example response + +Upon success, the response returns the following JSON object: + +````json +{ + "acknowledged" : true +} +```` + +To verify that the repository was deleted, use the [Get snapshot repository]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot-repository) API, passing the repository name as the `repository` path parameter. +{: .note} \ No newline at end of file diff --git a/_api-reference/snapshots/delete-snapshot.md b/_api-reference/snapshots/delete-snapshot.md new file mode 100644 index 00000000..e4232c20 --- /dev/null +++ b/_api-reference/snapshots/delete-snapshot.md @@ -0,0 +1,47 @@ +--- +layout: default +title: Delete Snapshot +parent: Snapshot APIs +nav_order: 7 +--- + +## Delete snapshot +**Introduced 1.0** +{: .label .label-purple } + +Deletes a snapshot from a repository. + +* To learn more about snapshots, see [Snapshots]({{site.url}}{{site.baseurl}}/opensearch/snapshots/index). + +* To view a list of your repositories, see [cat repositories]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-repositories). + +* To view a list of your snapshots, see [cat snapshots]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-snapshots). + +## Path parameters + +Parameter | Data type | Description +:--- | :--- | :--- +repository | String | Repostory that contains the snapshot. | +snapshot | String | Snapshot to delete. 
| + +#### Example request + +The following request deletes a snapshot called `my-first-snapshot` from the `my-opensearch-repo` repository: + +```json +DELETE _snapshot/my-opensearch-repo/my-first-snapshot +``` +{% include copy-curl.html %} + +#### Example response + +Upon success, the response returns the following JSON object: + +```json +{ + "acknowledged": true +} +``` + +To verify that the snapshot was deleted, use the [Get snapshot]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot) API, passing the snapshot name as the `snapshot` path parameter. +{: .note} diff --git a/_api-reference/snapshots/get-snapshot-repository.md b/_api-reference/snapshots/get-snapshot-repository.md new file mode 100644 index 00000000..e3664e11 --- /dev/null +++ b/_api-reference/snapshots/get-snapshot-repository.md @@ -0,0 +1,63 @@ +--- +layout: default +title: Get Snapshot Repository +parent: Snapshot APIs +nav_order: 2 +--- + +# Get snapshot repository +**Introduced 1.0** +{: .label .label-purple } + +Retrieves information about a snapshot repository. + +To learn more about repositories, see [Register repository]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#register-repository). + +You can also get details about a snapshot during and after snapshot creation. See [Get snapshot status]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot-status/). +{: .note} + +## Path parameters + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| repository | String | A comma-separated list of snapshot repository names to retrieve. Wildcard (`*`) expressions are supported, including combining wildcards with exclude patterns starting with `-`. | + +## Query parameters + +| Parameter | Data type | Description | +:--- | :--- | :--- +| local | Boolean | Whether to get information from the local node. Optional, defaults to `false`.| +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager node. Optional, defaults to 30 seconds. | + +#### Example request + +The following request retrieves information for the `my-opensearch-repo` repository: + +````json +GET /_snapshot/my-opensearch-repo +```` +{% include copy-curl.html %} + +#### Example response + +Upon success, the response returns repository information. This sample is for an `s3` repository type. + +````json +{ + "my-opensearch-repo" : { + "type" : "s3", + "settings" : { + "bucket" : "my-open-search-bucket", + "base_path" : "snapshots" + } + } +} +```` + +## Response fields + +| Field | Data type | Description | +| :--- | :--- | :--- | +| type | string | Bucket type: `fs` (file system) or `s3` (S3 bucket). | +| bucket | string | S3 bucket name. | +| base_path | string | Folder within the bucket where snapshots are stored. | \ No newline at end of file diff --git a/_api-reference/snapshots/get-snapshot-status.md b/_api-reference/snapshots/get-snapshot-status.md new file mode 100644 index 00000000..02aa4190 --- /dev/null +++ b/_api-reference/snapshots/get-snapshot-status.md @@ -0,0 +1,423 @@ +--- +layout: default +title: Get Snapshot Status +parent: Snapshot APIs +nav_order: 8 +--- + +# Get snapshot status +**Introduced 1.0** +{: .label .label-purple } + +Returns details about a snapshot’s state during and after snapshot creation. + +To learn about snapshot creation, see [Create snapshot]({{site.url}}{{site.baseurl}}/api-reference/snapshots/create-snapshot). + +If you use the Security plugin, you must have the `monitor_snapshot`, `create_snapshot`, or `manage cluster` privileges. 
+{: .note} + +## Path parameters + +Path parameters are optional. + +| Parameter | Data type | Description | +:--- | :--- | :--- +| repository | String | Repository containing the snapshot. | +| snapshot | String | Snapshot to return. | + +Three request variants provide flexibility: + +* `GET _snapshot/_status` returns the status of all currently running snapshots in all repositories. + +* `GET _snapshot/<repository>/_status` returns the status of only currently running snapshots in the specified repository. This is the preferred variant. + +* `GET _snapshot/<repository>/<snapshot>/_status` returns the status of all snapshots in the specified repository, whether they are running or not. + +Using the API to return the state of snapshots other than those that are currently running can be very costly in terms of (1) machine resources and (2) processing time if running in the cloud. For each snapshot, each request causes a file read of all of the snapshot's shards. +{: .warning} + +## Request fields + +| Field | Data type | Description | +:--- | :--- | :--- +| ignore_unavailable | Boolean | How to handle requests for unavailable snapshots. If `false`, the request returns an error for unavailable snapshots. If `true`, the request ignores unavailable snapshots, such as those that are corrupted or temporarily cannot be returned. Defaults to `false`.| + +#### Example request + +The following request returns the status of `my-first-snapshot` in the `my-opensearch-repo` repository. Unavailable snapshots are ignored. + +````json +GET _snapshot/my-opensearch-repo/my-first-snapshot/_status +{ + "ignore_unavailable": true +} +```` +{% include copy-curl.html %} + +#### Example response + +The example that follows corresponds to the request above in the [Example request](#example-request) section. + +The `GET _snapshot/my-opensearch-repo/my-first-snapshot/_status` request returns the following fields: + +````json +{ + "snapshots" : [ + { + "snapshot" : "my-first-snapshot", + "repository" : "my-opensearch-repo", + "uuid" : "dCK4Qth-TymRQ7Tu7Iga0g", + "state" : "SUCCESS", + "include_global_state" : true, + "shards_stats" : { + "initializing" : 0, + "started" : 0, + "finalizing" : 0, + "done" : 7, + "failed" : 0, + "total" : 7 + }, + "stats" : { + "incremental" : { + "file_count" : 31, + "size_in_bytes" : 24488927 + }, + "total" : { + "file_count" : 31, + "size_in_bytes" : 24488927 + }, + "start_time_in_millis" : 1660666841667, + "time_in_millis" : 14054 + }, + "indices" : { + ".opensearch-observability" : { + "shards_stats" : { + "initializing" : 0, + "started" : 0, + "finalizing" : 0, + "done" : 1, + "failed" : 0, + "total" : 1 + }, + "stats" : { + "incremental" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "total" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "start_time_in_millis" : 1660666841868, + "time_in_millis" : 201 + }, + "shards" : { + "0" : { + "stage" : "DONE", + "stats" : { + "incremental" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "total" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "start_time_in_millis" : 1660666841868, + "time_in_millis" : 201 + } + } + } + }, + "shakespeare" : { + "shards_stats" : { + "initializing" : 0, + "started" : 0, + "finalizing" : 0, + "done" : 1, + "failed" : 0, + "total" : 1 + }, + "stats" : { + "incremental" : { + "file_count" : 4, + "size_in_bytes" : 18310117 + }, + "total" : { + "file_count" : 4, + "size_in_bytes" : 18310117 + }, + "start_time_in_millis" : 1660666842470, + "time_in_millis" : 13050 + }, + "shards" : { + "0" : { + "stage" : "DONE", + "stats" : { + "incremental" 
: { + "file_count" : 4, + "size_in_bytes" : 18310117 + }, + "total" : { + "file_count" : 4, + "size_in_bytes" : 18310117 + }, + "start_time_in_millis" : 1660666842470, + "time_in_millis" : 13050 + } + } + } + }, + "opensearch_dashboards_sample_data_flights" : { + "shards_stats" : { + "initializing" : 0, + "started" : 0, + "finalizing" : 0, + "done" : 1, + "failed" : 0, + "total" : 1 + }, + "stats" : { + "incremental" : { + "file_count" : 10, + "size_in_bytes" : 6132245 + }, + "total" : { + "file_count" : 10, + "size_in_bytes" : 6132245 + }, + "start_time_in_millis" : 1660666843476, + "time_in_millis" : 6221 + }, + "shards" : { + "0" : { + "stage" : "DONE", + "stats" : { + "incremental" : { + "file_count" : 10, + "size_in_bytes" : 6132245 + }, + "total" : { + "file_count" : 10, + "size_in_bytes" : 6132245 + }, + "start_time_in_millis" : 1660666843476, + "time_in_millis" : 6221 + } + } + } + }, + ".opendistro-reports-definitions" : { + "shards_stats" : { + "initializing" : 0, + "started" : 0, + "finalizing" : 0, + "done" : 1, + "failed" : 0, + "total" : 1 + }, + "stats" : { + "incremental" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "total" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "start_time_in_millis" : 1660666843076, + "time_in_millis" : 200 + }, + "shards" : { + "0" : { + "stage" : "DONE", + "stats" : { + "incremental" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "total" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "start_time_in_millis" : 1660666843076, + "time_in_millis" : 200 + } + } + } + }, + ".opendistro-reports-instances" : { + "shards_stats" : { + "initializing" : 0, + "started" : 0, + "finalizing" : 0, + "done" : 1, + "failed" : 0, + "total" : 1 + }, + "stats" : { + "incremental" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "total" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "start_time_in_millis" : 1660666841667, + "time_in_millis" : 201 + }, + "shards" : { + "0" : { + "stage" : "DONE", + "stats" : { + "incremental" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "total" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "start_time_in_millis" : 1660666841667, + "time_in_millis" : 201 + } + } + } + }, + ".kibana_1" : { + "shards_stats" : { + "initializing" : 0, + "started" : 0, + "finalizing" : 0, + "done" : 1, + "failed" : 0, + "total" : 1 + }, + "stats" : { + "incremental" : { + "file_count" : 13, + "size_in_bytes" : 45733 + }, + "total" : { + "file_count" : 13, + "size_in_bytes" : 45733 + }, + "start_time_in_millis" : 1660666842673, + "time_in_millis" : 2007 + }, + "shards" : { + "0" : { + "stage" : "DONE", + "stats" : { + "incremental" : { + "file_count" : 13, + "size_in_bytes" : 45733 + }, + "total" : { + "file_count" : 13, + "size_in_bytes" : 45733 + }, + "start_time_in_millis" : 1660666842673, + "time_in_millis" : 2007 + } + } + } + }, + ".opensearch-notifications-config" : { + "shards_stats" : { + "initializing" : 0, + "started" : 0, + "finalizing" : 0, + "done" : 1, + "failed" : 0, + "total" : 1 + }, + "stats" : { + "incremental" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "total" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "start_time_in_millis" : 1660666842270, + "time_in_millis" : 200 + }, + "shards" : { + "0" : { + "stage" : "DONE", + "stats" : { + "incremental" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "total" : { + "file_count" : 1, + "size_in_bytes" : 208 + }, + "start_time_in_millis" : 1660666842270, + "time_in_millis" : 200 + } + } + } + } + } + } + 
] +} +```` + +## Response fields + +| Field | Data type | Description | +:--- | :--- | :--- +| repository | String | Name of repository that contains the snapshot. | +| snapshot | String | Snapshot name. | +| uuid | String | Snapshot's universally unique identifier (UUID). | +| state | String | Snapshot's current status. See [Snapshot states](#snapshot-states). | +| include_global_state | Boolean | Whether the current cluster state is included in the snapshot. | +| shards_stats | Object | Snapshot's shard counts. See [Shard stats](#shard-stats). | +| stats | Object | Details of files included in the snapshot. `file_count`: number of files. `size_in_bytes`: total of all file sizes. See [Snapshot file stats](#snapshot-file-stats). | +| indices | list of Objects | List of objects that contain information about the indices in the snapshot. See [Index objects](#index-objects).| + +##### Snapshot states + +| State | Description | +:--- | :--- | +| FAILED | The snapshot terminated in an error and no data was stored. | +| IN_PROGRESS | The snapshot is currently running. | +| PARTIAL | The global cluster state was stored, but data from at least one shard was not stored. The `failures` property of the [Create snapshot]({{site.url}}{{site.baseurl}}/api-reference/snapshots/create-snapshot) response contains additional details. | +| SUCCESS | The snapshot finished and all shards were stored successfully. | + +##### Shard stats + +All property values are Integers. + +| Property | Description | +:--- | :--- | +| initializing | Number of shards that are still initializing. | +| started | Number of shards that have started but are not finalized. | +| finalizing | Number of shards that are finalizing but are not done. | +| done | Number of shards that initialized, started, and finalized successfully. | +| failed | Number of shards that failed to be included in the snapshot. | +| total | Total number of shards included in the snapshot. | + +##### Snapshot file stats + +| Property | Type | Description | +:--- | :--- | :--- | +| incremental | Object | Number and size of files that still need to be copied during snapshot creation. For completed snapshots, `incremental` provides the number and size of files that were not already in the repository and were copied as part of the incremental snapshot. | +| processed | Object | Number and size of files already uploaded to the snapshot. The processed `file_count` and `size_in_bytes` are incremented in stats after a file is uploaded. | +| total | Object | Total number and size of files that are referenced by the snapshot. | +| start_time_in_millis | Long | Time (in milliseconds) when snapshot creation began. | +| time_in_millis | Long | Total time (in milliseconds) that the snapshot took to complete. | + +##### Index objects + +| Property | Type | Description | +:--- | :--- | :--- | +| shards_stats | Object | See [Shard stats](#shard-stats). | +| stats | Object | See [Snapshot file stats](#snapshot-file-stats). | +| shards | list of Objects | List of objects containing information about the shards included in the snapshot. Properties of the shards are listed below in bold text.

**stage**: Current state of shards in the snapshot. Shard states are:

* DONE: Number of shards in the snapshot that were successfully stored in the repository.

* FAILURE: Number of shards in the snapshot that were not successfully stored in the repository.

* FINALIZE: Number of shards in the snapshot that are in the finalizing stage of being stored in the repository.

* INIT: Number of shards in the snapshot that are in the initializing stage of being stored in the repository.

* STARTED: Number of shards in the snapshot that are in the started stage of being stored in the repository.

**stats**: See [Snapshot file stats](#snapshot-file-stats).

**total**: Total number and size of files referenced by the snapshot.

**start_time_in_millis**: Time (in milliseconds) when snapshot creation began.

**time_in_millis**: Total time (in milliseconds) that the snapshot took to complete. | \ No newline at end of file diff --git a/_api-reference/snapshots/get-snapshot.md b/_api-reference/snapshots/get-snapshot.md new file mode 100644 index 00000000..da44c1f2 --- /dev/null +++ b/_api-reference/snapshots/get-snapshot.md @@ -0,0 +1,94 @@ +--- +layout: default +title: Get Snapshot +parent: Snapshot APIs +nav_order: 6 +--- + +# Get snapshot +**Introduced 1.0** +{: .label .label-purple } + +Retrieves information about a snapshot. + +## Path parameters + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| repository | String | The repository that contains the snapshot to retrieve. | +| snapshot | String | Snapshot to retrieve. | + +## Query parameters + +| Parameter | Data type | Description | +:--- | :--- | :--- +| verbose | Boolean | Whether to show all or just basic snapshot information. If `true`, returns all information. If `false`, omits information like start/end times, failures, and shards. Optional, defaults to `true`.| +| ignore_unavailable | Boolean | How to handle snapshots that are unavailable (corrupted or otherwise temporarily can't be returned). If `true` and the snapshot is unavailable, the request does not return the snapshot. If `false` and the snapshot is unavailable, the request returns an error. Optional, defaults to `false`.| + +#### Example request + +The following request retrieves information for the `my-first-snapshot` snapshot located in the `my-opensearch-repo` repository: + +````json +GET _snapshot/my-opensearch-repo/my-first-snapshot +```` +{% include copy-curl.html %} + +#### Example response + +Upon success, the response returns snapshot information: + +````json +{ + "snapshots" : [ + { + "snapshot" : "my-first-snapshot", + "uuid" : "3P7Qa-M8RU6l16Od5n7Lxg", + "version_id" : 136217927, + "version" : "2.0.1", + "indices" : [ + ".opensearch-observability", + ".opendistro-reports-instances", + ".opensearch-notifications-config", + "shakespeare", + ".opendistro-reports-definitions", + "opensearch_dashboards_sample_data_flights", + ".kibana_1" + ], + "data_streams" : [ ], + "include_global_state" : true, + "state" : "SUCCESS", + "start_time" : "2022-08-11T20:30:00.399Z", + "start_time_in_millis" : 1660249800399, + "end_time" : "2022-08-11T20:30:14.851Z", + "end_time_in_millis" : 1660249814851, + "duration_in_millis" : 14452, + "failures" : [ ], + "shards" : { + "total" : 7, + "failed" : 0, + "successful" : 7 + } + } + ] +} +```` +## Response fields + +| Field | Data type | Description | +| :--- | :--- | :--- | +| snapshot | string | Snapshot name. | +| uuid | string | Snapshot's universally unique identifier (UUID). | +| version_id | int | Build ID of the OpenSearch version that created the snapshot. | +| version | float | OpenSearch version that created the snapshot. | +| indices | array | Indices in the snapshot. | +| data_streams | array | Data streams in the snapshot. | +| include_global_state | boolean | Whether the current cluster state is included in the snapshot. | +| start_time | string | Date/time when the snapshot creation process began. | +| start_time_in_millis | long | Time (in milliseconds) when the snapshot creation process began. | +| end_time | string | Date/time when the snapshot creation process ended. | +| end_time_in_millis | long | Time (in milliseconds) when the snapshot creation process ended. | +| duration_in_millis | long | Total time (in milliseconds) that the snapshot creation process lasted. 
| +| failures | array | Failures, if any, that occurred during snapshot creation. | +| shards | object | Total number of shards created along with number of successful and failed shards. | +| state | string | Snapshot status. Possible values: `IN_PROGRESS`, `SUCCESS`, `FAILED`, `PARTIAL`. | \ No newline at end of file diff --git a/_api-reference/snapshots/index.md b/_api-reference/snapshots/index.md new file mode 100644 index 00000000..e341905a --- /dev/null +++ b/_api-reference/snapshots/index.md @@ -0,0 +1,15 @@ +--- +layout: default +title: Snapshot APIs +has_children: true +nav_order: 80 +redirect_from: + - /opensearch/rest-api/document-apis/ + - /opensearch/rest-api/snapshots/ +--- + +# Snapshot APIs +**Introduced 1.0** +{: .label .label-purple } + +The snapshot APIs allow you to manage snapshots and snapshot repositories. \ No newline at end of file diff --git a/_api-reference/snapshots/restore-snapshot.md b/_api-reference/snapshots/restore-snapshot.md new file mode 100644 index 00000000..7b82f722 --- /dev/null +++ b/_api-reference/snapshots/restore-snapshot.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Restore Snapshot +parent: Snapshot APIs + +nav_order: 9 +--- + +# Restore snapshot +**Introduced 1.0** +{: .label .label-purple } + +Restores a snapshot of a cluster or specified data streams and indices. + +* For information about indices and clusters, see [Introduction to OpenSearch]({{site.url}}{{site.baseurl}}/opensearch/index). + +* For information about data streams, see [Data streams]({{site.url}}{{site.baseurl}}/opensearch/data-streams). + +If open indexes with the same names as the indexes that you want to restore already exist in the cluster, you must close, delete, or rename them. See [Example request](#example-request) for information about renaming an index. See [Close index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/close-index) for information about closing an index. +{: .note} + +## Path parameters + +| Parameter | Data type | Description | +:--- | :--- | :--- +| repository | String | Repository containing the snapshot to restore. | +| snapshot | String | Snapshot to restore. | + +## Query parameters + +Parameter | Data type | Description +:--- | :--- | :--- +wait_for_completion | Boolean | Whether to wait for snapshot restoration to complete before continuing. | + +### Request fields + +All request body parameters are optional. + +| Parameter | Data type | Description | +:--- | :--- | :--- +| ignore_unavailable | Boolean | How to handle data streams or indices that are missing or closed. If `false`, the request returns an error for any data stream or index that is missing or closed. If `true`, the request ignores data streams and indices that are missing or closed. Defaults to `false`. | +| ignore_index_settings | String | A comma-delimited list of index settings that you don't want to restore from a snapshot. | +| include_aliases | Boolean | How to handle index aliases from the original snapshot. If `true`, index aliases from the original snapshot are restored. If `false`, aliases are not restored along with their associated indices. Defaults to `true`. | +| include_global_state | Boolean | Whether to restore the current cluster state.<sup>1</sup> If `false`, the cluster state is not restored. If `true`, the current cluster state is restored. Defaults to `false`.| +| index_settings | String | A comma-delimited list of settings to add or change in all restored indices. Use this parameter to override index settings during snapshot restoration. 
For data streams, these index settings are applied to the restored backing indices. | +| indices | String | A comma-delimited list of data streams and indices to restore from the snapshot. Multi-index syntax is supported. By default, a restore operation includes all data streams and indices in the snapshot. If this argument is provided, the restore operation only includes the data streams and indices that you specify. | +| partial | Boolean | How the restore operation will behave if indices in the snapshot do not have all primary shards available. If `false`, the entire restore operation fails if any indices in the snapshot do not have all primary shards available.

If `true`, allows the restoration of a partial snapshot of indices with unavailable shards. Only shards that were successfully included in the snapshot are restored, and all missing shards are recreated as empty. Defaults to `false`. |

The rename pattern is applied as a regular expression that supports referencing the original text through capture groups, which can then be used in `rename_replacement`.

The request fails if two or more data streams or indices are renamed into the same name. If you rename a restored data stream, its backing indices are also renamed. For example, if you rename the `logs` data stream to `recovered-logs`, the backing index `.ds-logs-1` is renamed to `.ds-recovered-logs-1`.

If you rename a restored data stream, ensure that an index template matches the new stream name. If no index template matches the new name, the stream cannot roll over, and new backing indices are not created.| +| rename_replacement | String | The rename replacement string. See `rename_pattern` for more information.| +| source_remote_store_repository | String | The name of the remote store repository of the source index being restored. If not provided, the Snapshot Restore API will use the repository that was registered when the snapshot was created. | +| wait_for_completion | Boolean | Whether to return a response after the restore operation has completed. If `false`, the request returns a response when the restore operation initializes. If `true`, the request returns a response when the restore operation completes. Defaults to `false`. | + +<sup>1</sup>The cluster state includes: +* Persistent cluster settings +* Index templates +* Legacy index templates +* Ingest pipelines +* Index lifecycle policies + +#### Example request + +The following request restores the `opendistro-reports-definitions` index from `my-first-snapshot`. The `rename_pattern` and `rename_replacement` combination causes the index to be renamed to `opendistro-reports-definitions_restored` because duplicate open index names in a cluster are not allowed. + +````json +POST /_snapshot/my-opensearch-repo/my-first-snapshot/_restore +{ + "indices": "opendistro-reports-definitions", + "ignore_unavailable": true, + "include_global_state": false, + "rename_pattern": "(.+)", + "rename_replacement": "$1_restored", + "include_aliases": false +} +```` + +#### Example response + +Upon success, the response returns the following JSON object: + +````json +{ + "snapshot" : { + "snapshot" : "my-first-snapshot", + "indices" : [ ], + "shards" : { + "total" : 0, + "failed" : 0, + "successful" : 0 + } + } +} +```` +Except for the snapshot name, all properties are empty or `0`. This is because any changes made to the volume after the snapshot was generated are lost. However, if you invoke the [Get snapshot]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot) API to examine the snapshot, a fully populated snapshot object is returned. + +## Response fields + +| Field | Data type | Description | +| :--- | :--- | :--- | +| snapshot | string | Snapshot name. | +| indices | array | Indices in the snapshot. | +| shards | object | Total number of shards created along with number of successful and failed shards. | + +If open indices in a snapshot already exist in a cluster, and you don't delete, close, or rename them, the API returns an error like the following: +{: .note} + +````json +{ + "error" : { + "root_cause" : [ + { + "type" : "snapshot_restore_exception", + "reason" : "[my-opensearch-repo:my-first-snapshot/dCK4Qth-TymRQ7Tu7Iga0g] cannot restore index [.opendistro-reports-definitions] because an open index with same name already exists in the cluster. Either close or delete the existing index or restore the index under a different name by providing a rename pattern and replacement name" + } + ], + "type" : "snapshot_restore_exception", + "reason" : "[my-opensearch-repo:my-first-snapshot/dCK4Qth-TymRQ7Tu7Iga0g] cannot restore index [.opendistro-reports-definitions] because an open index with same name already exists in the cluster. 
Either close or delete the existing index or restore the index under a different name by providing a rename pattern and replacement name" + }, + "status" : 500 +} +```` \ No newline at end of file diff --git a/_api-reference/snapshots/verify-snapshot-repository.md b/_api-reference/snapshots/verify-snapshot-repository.md new file mode 100644 index 00000000..29299524 --- /dev/null +++ b/_api-reference/snapshots/verify-snapshot-repository.md @@ -0,0 +1,77 @@ +--- +layout: default +title: Verify Snapshot Repository +parent: Snapshot APIs + +nav_order: 4 +--- + +# Verify snapshot repository +**Introduced 1.0** +{: .label .label-purple } + +Verifies that a snapshot repository is functional. Verifies the repository on each node in a cluster. + +If verification is successful, the verify snapshot repository API returns a list of nodes connected to the snapshot repository. If verification fails, the API returns an error. + +If you use the Security plugin, you must have the `manage cluster` privilege. +{: .note} + +## Path parameters + +Path parameters are optional. + +| Parameter | Data type | Description | +:--- | :--- +| repository | String | Name of repository to verify. | + +## Query parameters + +| Parameter | Data type | Description | +:--- | :--- | :--- +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager node. Optional, defaults to `30s`. | +| timeout | Time | The period of time to wait for a response. If a response is not received before the timeout value, the request fails and returns an error. Defaults to `30s`. | + +#### Example request + +The following request verifies that the `my-opensearch-repo` repository is functional: + +````json +POST /_snapshot/my-opensearch-repo/_verify?timeout=0s&cluster_manager_timeout=50s +```` + +#### Example response + +The example that follows corresponds to the request above in the [Example request](#example-request) section. + +The `POST /_snapshot/my-opensearch-repo/_verify?timeout=0s&cluster_manager_timeout=50s` request returns the following fields: + +````json +{ + "nodes" : { + "by1kztwTRoeCyg4iGU5Y8A" : { + "name" : "opensearch-node1" + } + } +} +```` + +In the preceding sample, one node is connected to the snapshot repository. If more were connected, you would see them in the response. Example: + +````json +{ + "nodes" : { + "lcfL6jv2jo6sMEtp4idMvg" : { + "name" : "node-1" + }, + "rEPtFT/B+cuuOHnQn0jy4s" : { + "name" : "node-2" + } + } +} +```` + +## Response fields + +| Field | Data type | Description | +:--- | :--- | :--- +| nodes | Object | A list (not an array) of nodes connected to the snapshot repository. Each node is a property in which the node ID is the key and the value is an object containing the node's `name` (String). | \ No newline at end of file diff --git a/_opensearch/rest-api/tasks.md b/_api-reference/tasks.md similarity index 67% rename from _opensearch/rest-api/tasks.md rename to _api-reference/tasks.md index 5702725c..5c3a41fd 100644 --- a/_opensearch/rest-api/tasks.md +++ b/_api-reference/tasks.md @@ -1,12 +1,13 @@ --- layout: default title: Tasks -parent: REST API reference -nav_order: 80 +nav_order: 85 +redirect_from: + - /opensearch/rest-api/tasks/ --- # Tasks -Introduced 1.0 +**Introduced 1.0** {: .label .label-purple } A task is any operation you run in a cluster. For example, searching your data collection of books for a title or author name is a task. When you run OpenSearch, a task is automatically created to monitor your cluster's health and performance. 
For more information about all of the tasks currently executing in your cluster, you can use the `tasks` API operation. @@ -16,16 +17,18 @@ The following request returns information about all of your tasks: ``` GET _tasks ``` +{% include copy-curl.html %} By including a task ID, you can get information specific to a particular task. Note that a task ID consists of a node's identifying string and the task's numerical ID. For example, if your node's identifying string is `nodestring` and the task's numerical ID is `1234`, then your task ID is `nodestring:1234`. You can find this information by running the `tasks` operation: ``` GET _tasks/<task_id> ``` +{% include copy-curl.html %} Note that if a task finishes running, it won't be returned as part of your request. For an example of a task that takes a little longer to finish, you can run the [`_reindex`]({{site.url}}{{site.baseurl}}/opensearch/reindex-data) API operation on a larger document, and then run `tasks`. -**Sample Response** +#### Example response ```json { "nodes": { @@ -50,6 +53,7 @@ Note that if a task finishes running, it won't be returned as part of your reque "running_time_in_nanos": 994000, "cancellable": false, "headers": {} + } }, "Mgqdm0r9SEGClWxp_RbnaQ:17413": { "node": "Mgqdm0r9SEGClWxp_RbnaQ", @@ -77,6 +81,7 @@ Note that if a task finishes running, it won't be returned as part of your reque } } ``` + You can also use the following parameters with your query. Parameter | Data type | Description | @@ -88,17 +93,18 @@ Parameter | Data type | Description | `wait_for_completion` | Boolean | Waits for the matching tasks to complete. (Default: false) `group_by` | Enum | Groups tasks by parent/child relationships or nodes. (Default: nodes) `timeout` | Time | An explicit operation timeout. (Default: 30 seconds) -`master_timeout` | Time | The time to wait for a connection to the primary node. (Default: 30 seconds) +`cluster_manager_timeout` | Time | The time to wait for a connection to the cluster manager node. 
(Default: 30 seconds) For example, this request returns tasks currently running on a node named `opensearch-node1`: -**Sample Request** +#### Example request -``` +```json GET /_tasks?nodes=opensearch-node1 ``` +{% include copy-curl.html %} -**Sample Response** +#### Example response ```json { @@ -142,6 +148,93 @@ GET /_tasks?nodes=opensearch-node1 } ``` +The following request returns detailed information about active search tasks: + +#### Example request + +```bash +curl -XGET "localhost:9200/_tasks?actions=*search&detailed +``` +{% include copy.html %} + +#### Example response + +```json +{ + "nodes" : { + "CRqNwnEeRXOjeTSYYktw-A" : { + "name" : "runTask-0", + "transport_address" : "127.0.0.1:9300", + "host" : "127.0.0.1", + "ip" : "127.0.0.1:9300", + "roles" : [ + "cluster_manager", + "data", + "ingest", + "remote_cluster_client" + ], + "attributes" : { + "testattr" : "test", + "shard_indexing_pressure_enabled" : "true" + }, + "tasks" : { + "CRqNwnEeRXOjeTSYYktw-A:677" : { + "node" : "CRqNwnEeRXOjeTSYYktw-A", + "id" : 677, + "type" : "transport", + "action" : "indices:data/read/search", + "description" : "indices[], search_type[QUERY_THEN_FETCH], source[{\"query\":{\"query_string\":}}]", + "start_time_in_millis" : 1660106254525, + "running_time_in_nanos" : 1354236, + "cancellable" : true, + "cancelled" : false, + "headers" : { }, + "resource_stats" : { + "average" : { + "cpu_time_in_nanos" : 0, + "memory_in_bytes" : 0 + }, + "total" : { + "cpu_time_in_nanos" : 0, + "memory_in_bytes" : 0 + }, + "min" : { + "cpu_time_in_nanos" : 0, + "memory_in_bytes" : 0 + }, + "max" : { + "cpu_time_in_nanos" : 0, + "memory_in_bytes" : 0 + }, + "thread_info" : { + "thread_executions" : 0, + "active_threads" : 0 + } + } + } + } + } + } +} + +``` + +### The `resource_stats` object + +The `resource_stats` object is only updated for tasks that support resource tracking. These stats are computed based on scheduled thread executions, including both threads that have finished working on the task and threads currently working on the task. Because the same thread may be scheduled to work on the same task multiple times, each instance of a given thread being scheduled to work on a given task is considered to be a single thread execution. + +The following table lists all response fields in the `resource_stats` object. + +Response field | Description | +:--- | :--- | +`average` | The average resource usage across all scheduled thread executions. | +`total` | The sum of resource usages across all scheduled thread executions. | +`min` | The minimum resource usage across all scheduled thread executions. | +`max` | The maximum resource usage across all scheduled thread executions. | +`thread_info` | Thread-count-related stats.| +`thread_info.active_threads` | The number of threads currently working on the task. | +`thread_info.thread_executions` | The number of threads that have been scheduled to work on the task. | + ## Task canceling After getting a list of tasks, you can cancel all cancelable tasks with the following request: @@ -149,6 +242,7 @@ After getting a list of tasks, you can cancel all cancelable tasks with the foll ``` POST _tasks/_cancel ``` +{% include copy-curl.html %} Note that not all tasks are cancelable. To see if a task is cancelable, refer to the `cancellable` field in the response to your `tasks` API request. @@ -157,12 +251,14 @@ You can also cancel a task by including a specific task ID. 
``` POST _tasks/<task_id>/_cancel ``` +{% include copy-curl.html %} The `cancel` operation supports the same parameters as the `tasks` operation. The following example shows how to cancel all cancelable tasks on multiple nodes. ``` POST _tasks/_cancel?nodes=opensearch-node1,opensearch-node2 ``` +{% include copy-curl.html %} ## Attaching headers to tasks @@ -171,8 +267,9 @@ To associate requests with tasks for better tracking, you can provide a `X-Opaqu Usage: ```bash -curl -i -H "X-Opaque-Id: 111111" "https://localhost:9200/_tasks" -u 'admin:admin' --insecure +curl -i -H "X-Opaque-Id: 111111" "https://localhost:9200/_tasks" -u 'admin:<custom-admin-password>' --insecure ``` +{% include copy.html %} The `_tasks` operation returns the following result. @@ -229,5 +326,6 @@ content-length: 768 This operation supports the same parameters as the `tasks` operation. The following example shows how you can associate `X-Opaque-Id` with specific tasks: ```bash -curl -i -H "X-Opaque-Id: 123456" "https://localhost:9200/_tasks?nodes=opensearch-node1" -u 'admin:admin' --insecure +curl -i -H "X-Opaque-Id: 123456" "https://localhost:9200/_tasks?nodes=opensearch-node1" -u 'admin:<custom-admin-password>' --insecure ``` +{% include copy.html %} diff --git a/_opensearch/units.md b/_api-reference/units.md similarity index 93% rename from _opensearch/units.md rename to _api-reference/units.md index 0e098cfe..8a0d2705 100644 --- a/_opensearch/units.md +++ b/_api-reference/units.md @@ -2,9 +2,13 @@ layout: default title: Supported units nav_order: 90 +redirect_from: + - /opensearch/units/ --- # Supported units +**Introduced 1.0** +{: .label .label-purple } OpenSearch supports the following units for all REST operations: diff --git a/_automating-configurations/api/create-workflow.md b/_automating-configurations/api/create-workflow.md new file mode 100644 index 00000000..93530541 --- /dev/null +++ b/_automating-configurations/api/create-workflow.md @@ -0,0 +1,255 @@ +--- +layout: default +title: Create or update a workflow +parent: Workflow APIs +nav_order: 10 +--- + +# Create or update a workflow + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +Creating a workflow adds the content of a workflow template to the flow framework system index. You can provide workflows in JSON format (by specifying `Content-Type: application/json`) or YAML format (by specifying `Content-Type: application/yaml`). By default, the workflow is validated to help identify invalid configurations, including: + +* Workflow steps requiring an OpenSearch plugin that is not installed. +* Workflow steps relying on previous node input that is not provided by those steps. +* Workflow step fields with invalid values. +* Workflow graph (node/edge) configurations containing cycles or with duplicate IDs. + +To obtain the validation template for workflow steps, call the [Get Workflow Steps API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/). + +Once a workflow is created, provide its `workflow_id` to other APIs. + +The `POST` method creates a new workflow. The `PUT` method updates an existing workflow. + +You can only update a workflow if it has not yet been provisioned. 
+{: .note} + +## Path and HTTP methods + +```json +POST /_plugins/_flow_framework/workflow +PUT /_plugins/_flow_framework/workflow/<workflow_id> +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be updated. Required for the `PUT` method. | + +## Query parameters + +Workflows are normally created and provisioned in separate steps. However, once you have thoroughly tested the workflow, you can combine the create and provision steps by including the `provision` query parameter: + +```json +POST /_plugins/_flow_framework/workflow?provision=true +``` +{% include copy-curl.html %} + +When set to `true`, the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) is executed immediately following creation. + +By default, workflows are validated when they are created to ensure that the syntax is valid and that the graph does not contain cycles. This behavior can be controlled with the `validation` query parameter. If `validation` is set to `all`, OpenSearch performs a complete template validation. Any other value of the `validation` parameter suppresses validation, allowing an incomplete/work-in-progress template to be saved. To disable template validation, set `validation` to `none`: + +```json +POST /_plugins/_flow_framework/workflow?validation=none +``` +{% include copy-curl.html %} + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `provision` | Boolean | Whether to provision the workflow as part of the request. Default is `false`. | +| `validation` | String | Whether to validate the workflow. Valid values are `all` (validate the template) and `none` (do not validate the template). Default is `all`. | + +## Request fields + +The following table lists the available request fields. + +|Field |Data type |Required/Optional |Description | +|:--- |:--- |:--- |:--- | +|`name` |String |Required |The name of the workflow. | +|`description` |String |Optional |A description of the workflow. | +|`use_case` |String |Optional | A use case, which can be used with the Search Workflow API to find related workflows. In the future, OpenSearch may provide some standard use cases to ease categorization, but currently you can use this field to specify custom values. | +|`version` |Object |Optional | A key-value map with two fields: `template`, which identifies the template version, and `compatibility`, which identifies a list of minimum required OpenSearch versions. | +|`workflows` |Object |Optional |A map of workflows. Presently, only the `provision` key is supported. The value for the workflow key is a key-value map that includes fields for `user_params` and lists of `nodes` and `edges`. | + +#### Example request: Register and deploy an externally hosted model (YAML) + +To provide a template in YAML format, specify `Content-Type: application/yaml` in the request header: + +```bash +curl -XPOST "http://localhost:9200/_plugins/_flow_framework/workflow" -H 'Content-Type: application/yaml' +``` + +YAML templates permit comments. 
+{: .tip} + +The following is an example YAML template for registering and deploying an externally hosted model: + +```yaml +# This name is required +name: createconnector-registerremotemodel-deploymodel +# Other fields are optional but useful +description: This template creates a connector to a remote model, registers it, and + deploys that model +# Other templates with a similar use case can be searched +use_case: REMOTE_MODEL_DEPLOYMENT +version: + # Templates may be versioned by their authors + template: 1.0.0 + # Compatibility with OpenSearch 2.12.0 and higher and 3.0.0 and higher + compatibility: + - 2.12.0 + - 3.0.0 +# One or more workflows can be included, presently only provision is supported +workflows: + provision: + # These nodes are the workflow steps corresponding to ML Commons APIs + nodes: + # This ID must be unique to this workflow + - id: create_connector_1 + # There may be multiple steps with the same type + type: create_connector + # These inputs match the Create Connector API body + user_inputs: + name: OpenAI Chat Connector + description: The connector to public OpenAI model service for GPT 3.5 + version: '1' + protocol: http + parameters: + endpoint: api.openai.com + model: gpt-3.5-turbo + credential: + openAI_key: '12345' + actions: + - action_type: predict + method: POST + url: https://${parameters.endpoint}/v1/chat/completions + # This ID must be unique to this workflow + - id: register_model_2 + type: register_remote_model + # This step needs the connector_id produced as an output of the previous step + previous_node_inputs: + create_connector_1: connector_id + # These inputs match the Register Model API body + user_inputs: + name: openAI-gpt-3.5-turbo + function_name: remote + description: test model + # This ID must be unique to this workflow + - id: deploy_model_3 + type: deploy_model + # This step needs the model_id produced as an output of the previous step + previous_node_inputs: + register_model_2: model_id + # Since the nodes include previous_node_inputs these are optional to define + # They will be added automatically and included in the stored template + # Additional edges may also be added here if required for sequencing + edges: + - source: create_connector_1 + dest: register_model_2 + - source: register_model_2 + dest: deploy_model_3 +``` +{% include copy-curl.html %} + +#### Example request: Register and deploy a remote model (JSON) + +To provide a template in JSON format, specify `Content-Type: application/json` in the request header: + +```bash +curl -XPOST "http://localhost:9200/_plugins/_flow_framework/workflow" -H 'Content-Type: application/json' +``` +The following JSON template is equivalent to the YAML template provided in the previous section: + +```json +{ + "name": "createconnector-registerremotemodel-deploymodel", + "description": "This template creates a connector to a remote model, registers it, and deploys that model", + "use_case": "REMOTE_MODEL_DEPLOYMENT", + "version": { + "template": "1.0.0", + "compatibility": [ + "2.12.0", + "3.0.0" + ] + }, + "workflows": { + "provision": { + "nodes": [ + { + "id": "create_connector_1", + "type": "create_connector", + "user_inputs": { + "name": "OpenAI Chat Connector", + "description": "The connector to public OpenAI model service for GPT 3.5", + "version": "1", + "protocol": "http", + "parameters": { + "endpoint": "api.openai.com", + "model": "gpt-3.5-turbo" + }, + "credential": { + "openAI_key": "12345" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": 
"https://${parameters.endpoint}/v1/chat/completions" + } + ] + } + }, + { + "id": "register_model_2", + "type": "register_remote_model", + "previous_node_inputs": { + "create_connector_1": "connector_id" + }, + "user_inputs": { + "name": "openAI-gpt-3.5-turbo", + "function_name": "remote", + "description": "test model" + } + }, + { + "id": "deploy_model_3", + "type": "deploy_model", + "previous_node_inputs": { + "register_model_2": "model_id" + } + } + ], + "edges": [ + { + "source": "create_connector_1", + "dest": "register_model_2" + }, + { + "source": "register_model_2", + "dest": "deploy_model_3" + } + ] + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +OpenSearch responds with the `workflow_id`: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50" +} +``` + +Once you have created a workflow, you can use other workflow APIs with the `workflow_id`. \ No newline at end of file diff --git a/_automating-configurations/api/delete-workflow.md b/_automating-configurations/api/delete-workflow.md new file mode 100644 index 00000000..c1cee296 --- /dev/null +++ b/_automating-configurations/api/delete-workflow.md @@ -0,0 +1,56 @@ +--- +layout: default +title: Delete a workflow +parent: Workflow APIs +nav_order: 80 +--- + +# Delete a workflow + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +When you no longer need a workflow template, you can delete it by calling the Delete Workflow API. + +Note that deleting a workflow only deletes the stored template but does not deprovision its resources. + +## Path and HTTP methods + +```json +DELETE /_plugins/_flow_framework/workflow/ +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be retrieved. Required. | + +#### Example request + +``` +DELETE /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50 +``` +{% include copy-curl.html %} + +#### Example response + +If the workflow exists, a delete response contains the status of the deletion, where the `result` field is set to `deleted` on success or `not_found` if the workflow does not exist (it may have already been deleted): + +```json +{ + "_index": ".plugins-flow_framework-templates", + "_id": "8xL8bowB8y25Tqfenm50", + "_version": 2, + "result": "deleted", + "_shards": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "_seq_no": 2, + "_primary_term": 1 +} +``` \ No newline at end of file diff --git a/_automating-configurations/api/deprovision-workflow.md b/_automating-configurations/api/deprovision-workflow.md new file mode 100644 index 00000000..cdd85ef4 --- /dev/null +++ b/_automating-configurations/api/deprovision-workflow.md @@ -0,0 +1,61 @@ +--- +layout: default +title: Deprovision a workflow +parent: Workflow APIs +nav_order: 70 +--- + +# Deprovision a workflow + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +When you no longer need a workflow, you can deprovision its resources. 
Most workflow steps that create a resource have corresponding workflow steps to reverse that action. To retrieve all resources currently created for a workflow, call the [Get Workflow Status API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). When you call the Deprovision Workflow API, resources included in the `resources_created` field of the Get Workflow Status API response will be removed using a workflow step corresponding to the one that provisioned them. + +The workflow executes the provisioning workflow steps in reverse order. If failures occur because of resource dependencies, such as preventing deletion of a registered model if it is still deployed, the workflow attempts retries. + +## Path and HTTP methods + +```json +POST /_plugins/_flow_framework/workflow/<workflow_id>/_deprovision +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be deprovisioned. Required. | + +### Example request + +```json +POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_deprovision +``` +{% include copy-curl.html %} + +### Example response + +If deprovisioning is successful, OpenSearch responds with the same `workflow_id` that was used in the request: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50" +} +``` + +If deprovisioning did not completely remove all resources, OpenSearch responds with a `202 (ACCEPTED)` status and identifies the resources that were not deprovisioned: + +```json +{ + "error": "Failed to deprovision some resources: [connector_id Lw7PX4wBfVtHp98y06wV]." +} +``` + +In some cases, the failure happens because of another dependent resource that took some time to be removed. In this case, you can attempt to send the same request again. +{: .tip} + +To obtain a more detailed deprovisioning status than is provided by the summary in the error response, query the [Get Workflow Status API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). + +On success, the workflow returns to a `NOT_STARTED` state. If some resources have not yet been removed, they are provided in the response. \ No newline at end of file diff --git a/_automating-configurations/api/get-workflow-status.md b/_automating-configurations/api/get-workflow-status.md new file mode 100644 index 00000000..03870af1 --- /dev/null +++ b/_automating-configurations/api/get-workflow-status.md @@ -0,0 +1,114 @@ +--- +layout: default +title: Get a workflow status +parent: Workflow APIs +nav_order: 40 +--- + +# Get a workflow status + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +[Provisioning a workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) may take a significant amount of time, particularly when the action is associated with OpenSearch indexing operations. The Get Workflow State API permits monitoring of the provisioning deployment status until it is complete. + +## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow/<workflow_id>/_status +``` + +## Path parameters + +The following table lists the available path parameters. 
+ +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow from which to obtain the status. Required. | + +## Query parameters + +The `all` parameter specifies whether the response should return all fields. + +When set to `false` (the default), the response contains the following fields: + +- `workflow_id` +- any `error` state +- `state` +- a list of `resources_created` + +When set to `true`, the response contains the following additional fields: + +- `provisioning_progress` +- `provision_start_time` +- `provision_end_time` +- `user` +- `user_outputs` + +To receive all available fields in the response, set `all` to `true`: + +```json +GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_status?all=true +``` +{% include copy-curl.html %} + +#### Example request + +```json +GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_status +``` +{% include copy-curl.html %} + + +#### Example response + +OpenSearch responds with a summary of the provisioning status and a list of created resources. + +Before provisioning has begun, OpenSearch does not return any resources: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50", + "state": "NOT_STARTED" +} +``` + +While provisioning is in progress, OpenSearch returns a partial resource list: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50", + "state": "PROVISIONING", + "resources_created": [ + { + "workflow_step_name": "create_connector", + "workflow_step_id": "create_connector_1", + "resource_type": "connector_id", + "resource_id": "NdjCQYwBLmvn802B0IwE" + } + ] +} +``` + +Upon provisioning completion, OpenSearch returns the full resource list: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50", + "state": "COMPLETED", + "resources_created": [ + { + "workflow_step_name": "create_connector", + "workflow_step_id": "create_connector_1", + "resource_type": "connector_id", + "resource_id": "NdjCQYwBLmvn802B0IwE" + }, + { + "workflow_step_name": "register_remote_model", + "workflow_step_id": "register_model_2", + "resource_type": "model_id", + "resource_id": "N9jCQYwBLmvn802B0oyh" + } + ] +} +``` \ No newline at end of file diff --git a/_automating-configurations/api/get-workflow-steps.md b/_automating-configurations/api/get-workflow-steps.md new file mode 100644 index 00000000..b4859da7 --- /dev/null +++ b/_automating-configurations/api/get-workflow-steps.md @@ -0,0 +1,63 @@ +--- +layout: default +title: Get workflow steps +parent: Workflow APIs +nav_order: 50 +--- + +# Get workflow steps + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +OpenSearch validates workflows by using the validation template that lists the required inputs, generated outputs, and required plugins for all steps. For example, for the `register_remote_model` step, the validation template appears as follows: + +```json +{ + "register_remote_model": { + "inputs": [ + "name", + "connector_id" + ], + "outputs": [ + "model_id", + "register_model_status" + ], + "required_plugins": [ + "opensearch-ml" + ] + } +} +``` + +The Get Workflow Steps API retrieves this validation template. 
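+As a quick illustration, the following is a minimal sketch of extracting the required inputs for the `register_remote_model` step using the endpoint described in the next section. It assumes a local cluster listening on port 9200 without the Security plugin, that the `jq` utility is installed, and that the response mirrors the template structure shown above:
+
+```bash
+# Fetch the validation template and print the required inputs
+# for the register_remote_model step.
+curl -s -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/_steps" \
+  -H 'Content-Type: application/json' | jq '.register_remote_model.inputs'
+```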
+## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow/_steps +``` + +#### Example request + +```json +GET /_plugins/_flow_framework/workflow/_steps +``` +{% include copy-curl.html %} + + +#### Example response + +OpenSearch responds with the validation template containing the steps. The order of fields in the returned steps may not exactly match the original JSON but will function identically. + +To retrieve the template in YAML format, specify `Content-Type: application/yaml` in the request header: + +```bash +curl -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/_steps" -H 'Content-Type: application/yaml' +``` + +To retrieve the template in JSON format, specify `Content-Type: application/json` in the request header: + +```bash +curl -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/_steps" -H 'Content-Type: application/json' +``` \ No newline at end of file diff --git a/_automating-configurations/api/get-workflow.md b/_automating-configurations/api/get-workflow.md new file mode 100644 index 00000000..b49858ff --- /dev/null +++ b/_automating-configurations/api/get-workflow.md @@ -0,0 +1,50 @@ +--- +layout: default +title: Get a workflow +parent: Workflow APIs +nav_order: 20 +--- + +# Get a workflow + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +The Get Workflow API retrieves the workflow template. + +## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow/<workflow_id> +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be retrieved. Required. | + +#### Example request + +```json +GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50 +``` +{% include copy-curl.html %} + +#### Example response + +To retrieve a template in YAML format, specify `Content-Type: application/yaml` in the request header: + +```bash +curl -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50" -H 'Content-Type: application/yaml' +``` + +To retrieve a template in JSON format, specify `Content-Type: application/json` in the request header: + +```bash +curl -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50" -H 'Content-Type: application/json' +``` + +OpenSearch responds with the stored template containing the same content as the body of the [create workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/) request. The order of fields in the returned template may not exactly match the original template but will function identically. \ No newline at end of file diff --git a/_automating-configurations/api/index.md b/_automating-configurations/api/index.md new file mode 100644 index 00000000..5fb05053 --- /dev/null +++ b/_automating-configurations/api/index.md @@ -0,0 +1,24 @@ +--- +layout: default +title: Workflow APIs +nav_order: 40 +has_children: true +has_toc: false +--- + +# Workflow APIs + +This is an experimental feature and is not recommended for use in a production environment. 
For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +OpenSearch supports the following workflow APIs: + +* [Create or update workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/) +* [Get workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow/) +* [Provision workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) +* [Get workflow status]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/) +* [Get workflow steps]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/) +* [Search workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/search-workflow/) +* [Search workflow state]({{site.url}}{{site.baseurl}}/automating-configurations/api/search-workflow-state/) +* [Deprovision workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/deprovision-workflow/) +* [Delete workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/delete-workflow/) \ No newline at end of file diff --git a/_automating-configurations/api/provision-workflow.md b/_automating-configurations/api/provision-workflow.md new file mode 100644 index 00000000..5d2b5936 --- /dev/null +++ b/_automating-configurations/api/provision-workflow.md @@ -0,0 +1,51 @@ +--- +layout: default +title: Provision a workflow +parent: Workflow APIs +nav_order: 30 +--- + +# Provision a workflow + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +Provisioning a workflow is a one-time setup process usually performed by a cluster administrator to create resources that will be used by end users. + +The `workflows` template field may contain multiple workflows. The workflow with the `provision` key can be executed with this API. This API is also executed when the [Create or Update Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/create-workflow/) is called with the `provision` parameter set to `true`. + +You can only provision a workflow if it has not yet been provisioned. Deprovision the workflow if you need to repeat provisioning. +{: .note} + +## Path and HTTP methods + +```json +POST /_plugins/_flow_framework/workflow//_provision +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `workflow_id` | String | The ID of the workflow to be provisioned. Required. | + +#### Example request + +```json +POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision +``` +{% include copy-curl.html %} + +#### Example response + +OpenSearch responds with the same `workflow_id` that was used in the request: + +```json +{ + "workflow_id" : "8xL8bowB8y25Tqfenm50" +} +``` + +To obtain the provisioning status, query the [Get Workflow State API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). 
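+
+For example, one way to monitor provisioning is to poll the status endpoint until the returned `state` is `COMPLETED`. The following request reuses the sample workflow ID from the preceding examples:
+
+```json
+GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_status
+```
+{% include copy-curl.html %}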
\ No newline at end of file diff --git a/_automating-configurations/api/search-workflow-state.md b/_automating-configurations/api/search-workflow-state.md new file mode 100644 index 00000000..9e21f143 --- /dev/null +++ b/_automating-configurations/api/search-workflow-state.md @@ -0,0 +1,63 @@ +--- +layout: default +title: Search for a workflow state +parent: Workflow APIs +nav_order: 65 +--- + +# Search for a workflow + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +You can search for resources created by workflows by matching a query to a field. The fields you can search correspond to those returned by the [Get Workflow Status API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). + +## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow/state/_search +POST /_plugins/_flow_framework/workflow/state/_search +``` + +#### Example request: All workflows with a state of `NOT_STARTED` + +```json +GET /_plugins/_flow_framework/workflow/state/_search +{ + "query": { + "match": { + "state": "NOT_STARTED" + } + } +} +``` +{% include copy-curl.html %} + +#### Example request: All workflows that have a `resources_created` field with a `workflow_step_id` of `register_model_2` + +```json +GET /_plugins/_flow_framework/workflow/state/_search +{ + "query": { + "nested": { + "path": "resources_created", + "query": { + "bool": { + "must": [ + { + "match": { + "resources_created.workflow_step_id": "register_model_2" + } + } + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +The response contains documents matching the search parameters. \ No newline at end of file diff --git a/_automating-configurations/api/search-workflow.md b/_automating-configurations/api/search-workflow.md new file mode 100644 index 00000000..7eb8890f --- /dev/null +++ b/_automating-configurations/api/search-workflow.md @@ -0,0 +1,50 @@ +--- +layout: default +title: Search for a workflow +parent: Workflow APIs +nav_order: 60 +--- + +# Search for a workflow + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +You can retrieve created workflows with their `workflow_id` or search for workflows by using a query matching a field. You can use the `use_case` field to search for similar workflows. + +## Path and HTTP methods + +```json +GET /_plugins/_flow_framework/workflow/_search +POST /_plugins/_flow_framework/workflow/_search +``` + +#### Example request: All created workflows + +```json +GET /_plugins/_flow_framework/workflow/_search +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} + +#### Example request: All workflows with a `use_case` of `REMOTE_MODEL_DEPLOYMENT` + +```json +GET /_plugins/_flow_framework/workflow/_search +{ + "query": { + "match": { + "use_case": "REMOTE_MODEL_DEPLOYMENT" + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +OpenSearch responds with a list of workflow templates matching the search parameters. 
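+
+As an additional sketch (this assumes that the template's `name` field is searchable with a `match` query, which is not shown in the preceding examples), you can search for a workflow by the name specified in its template:
+
+```json
+GET /_plugins/_flow_framework/workflow/_search
+{
+  "query": {
+    "match": {
+      "name": "tool-register-agent"
+    }
+  }
+}
+```
+{% include copy-curl.html %}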
\ No newline at end of file diff --git a/_automating-configurations/index.md b/_automating-configurations/index.md new file mode 100644 index 00000000..2b9ffdcf --- /dev/null +++ b/_automating-configurations/index.md @@ -0,0 +1,49 @@ +--- +layout: default +title: Automating configurations +nav_order: 1 +has_children: false +nav_exclude: true +redirect_from: /automating-configurations/ +--- + +# Automating configurations +**Introduced 2.12** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +You can automate complex OpenSearch setup and preprocessing tasks by providing templates for common use cases. For example, automating machine learning (ML) setup tasks streamlines the use of OpenSearch ML offerings. + +In OpenSearch 2.12, configuration automation is limited to ML tasks. +{: .info} + +OpenSearch use case templates provide a compact description of the setup process in a JSON or YAML document. These templates describe automated workflow configurations for conversational chat or query generation, AI connectors, tools, agents, and other components that prepare OpenSearch as a backend for generative models. For template examples, see [Sample templates](https://github.com/opensearch-project/flow-framework/tree/main/sample-templates). + +## Key features + +Workflow automation provides the following benefits: + +* **Use case templates**: Get started with predefined templates that outline the setup process for your general use cases. +* **Customizable workflows**: Customize the workflow templates to your specific use case. +* **Setup automation**: Easily configure AI connectors, tools, agents, and other components in a single API call. + +## Overview + +**Templates** implement workflow automation in OpenSearch. You can provide these templates in JSON or YAML format. You can describe one or more templates with a sequence of steps required for a particular use case. Each template consists of the following elements: + +* **Metadata**: A name, description, use case category, template version, and OpenSearch version compatibility range. +* **User input**: Parameters expected from the user that are common to all automation steps across all workflows, such as an index name. +* **Workflows**: One or more workflows containing the following elements: + * **User input**: Parameters expected from the user that are specific to the steps in this workflow. + * **Workflow Steps**: The workflow steps described as a directed acyclic graph (DAG): + * ***Nodes*** describe steps of the process, which may be executed in parallel. For the syntax of workflow steps, see [Workflow steps]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-steps/). + * ***Edges*** sequence nodes to be executed after the previous step is complete and may use the output fields of previous steps. When a node includes a key in the `previous_node_input` map referring to a previous node’s workflow step, a corresponding edge is automatically added to the template during parsing and may be omitted for the sake of simplicity. + +## Next steps + +- For supported APIs, see [Workflow APIs]({{site.url}}{{site.baseurl}}/automating-configurations/api/index/). 
+- For the workflow step syntax, see [Workflow steps]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-steps/). +- For a complete example, see [Workflow tutorial]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/). +- For configurable settings, see [Workflow settings]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-settings/). diff --git a/_automating-configurations/workflow-settings.md b/_automating-configurations/workflow-settings.md new file mode 100644 index 00000000..f3138d0d --- /dev/null +++ b/_automating-configurations/workflow-settings.md @@ -0,0 +1,20 @@ +--- +layout: default +title: Workflow settings +nav_order: 30 +--- + +# Workflow settings + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +The following keys represent configurable workflow settings. + +|Setting |Data type |Default value |Description | +|:--- |:--- |:--- |:--- | +|`plugins.flow_framework.enabled` |Boolean |`false` |Whether the Flow Framework API is enabled. | +|`plugins.flow_framework.max_workflows` |Integer |`1000` | The maximum number of workflows that you can create. When the limit is above 1,000, the number of existing workflows is defined as a lower bound for performance reasons, so the actual maximum may slightly exceed this value. | +|`plugins.flow_framework.max_workflow_steps` |Integer |`50` |The maximum number of steps a workflow can have. | +|`plugins.flow_framework.request_timeout` |Time units |`10s` |The default timeout for REST requests, which applies to internal search queries. | +|`plugins.flow_framework.task_request_retry_duration` |Time units |`5s` | When steps correspond to an API that produces a `task_id`, OpenSearch will retry them at this interval until completion. | diff --git a/_automating-configurations/workflow-steps.md b/_automating-configurations/workflow-steps.md new file mode 100644 index 00000000..8565ccc2 --- /dev/null +++ b/_automating-configurations/workflow-steps.md @@ -0,0 +1,64 @@ +--- +layout: default +title: Workflow steps +nav_order: 10 +--- + +# Workflow steps + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +_Workflow steps_ form basic "building blocks" for process automation. Most steps directly correspond to OpenSearch or plugin API operations, such as CRUD operations on machine learning (ML) connectors, models, and agents. Some steps simplify the configuration by reusing the body expected by these APIs across multiple steps. For example, once you configure a _tool_, you can use it with multiple _agents_. + +## Workflow step fields + +Workflow steps are actively being developed to expand automation capabilities. Workflow step (graph node) configuration includes the following fields. + +|Field |Data type |Required/Optional |Description | +|:--- |:--- |:--- |:--- | +|`id` |String |Required | A user-provided ID for the step. The ID must be unique within a given workflow and is useful for identifying resources created by the step. For example, a `register_agent` step may return an `agent_id` that has been registered. 
Using this ID, you can determine which step produced which resource. | +|`type` |String |Required |The type of action to take, such as `deploy_model`, which corresponds to the API for which the step is used. Multiple steps may share the same type but must each have their own unique ID. For a list of supported types, see [Workflow step types](#workflow-step-types). | +|`previous_node_inputs` |Object |Optional | A key-value map specifying inputs that are produced by a previous step in the workflow. For each key-value pair, the key is the previous step's `id` and the value is the name of an API body field (such as `model_id`) that the previous step produces as output. For example, `register_remote_model` (key) may produce a `model_id` (value) that is required for a subsequent `deploy_model` step.
A graph edge is automatically added to the workflow, connecting the previous step (the key) as the source and the current step as the destination.
In some cases, you can include [additional inputs](#additional-fields) in this field. | +|`user_inputs` |Object |Optional | A key-value map of inputs supported by the corresponding API for this specific step. Some inputs are required for an API, while others are optional. Required inputs may be specified here, if known, or in the `previous_node_inputs` field. The [Get Workflow Steps API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/) identifies required inputs and step outputs.
Substitutions are supported in string values, lists of strings, and maps with string values. The pattern `{% raw %}${{previous_step_id.output_key}}{% endraw %}` is replaced by the value of the given key in the previous step's output. For example, if a parameter map in the user inputs includes a key `embedding_model_id` with the value `{% raw %}${{deploy_embedding_model.model_id}}{% endraw %}`, then the `model_id` output of the `deploy_embedding_model` step is substituted for the placeholder. This performs a similar function to the `previous_node_inputs` map but is not validated and does not automatically infer edges.
In some cases, you can include [additional inputs](#additional-fields) in this field. | + +## Workflow step types + +The following table lists the workflow step types. The `user_inputs` fields for these steps correspond directly to the linked APIs. + +|Step type |Corresponding API |Description | +|--- |--- |--- | +|`noop` |No API | A no-operation (no-op) step that does nothing. It may be useful in some cases for synchronizing parallel steps. | +|`create_connector` |[Create Connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/create-connector/) |Creates a connector to a model hosted on a third-party platform. | +|`delete_connector` |[Delete Connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/delete-connector/) |Deletes a connector to a model hosted on a third-party platform. | +|`register_model_group` |[Register Model Group]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/register-model-group/) |Registers a model group. The model group will be deleted automatically once no model is present in the group. | +|`register_remote_model` |[Register Model (remote)]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#register-a-model-hosted-on-a-third-party-platform) |Registers a model hosted on a third-party platform. If the `user_inputs` field contains a `deploy` key that is set to `true`, also deploys the model. | +|`register_local_pretrained_model` |[Register Model (pretrained)]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#register-a-pretrained-text-embedding-model) | Registers an OpenSearch-provided pretrained text embedding model that is hosted on your OpenSearch cluster. If the `user_inputs` field contains a `deploy` key that is set to `true`, also deploys the model. | +|`register_local_sparse_encoding_model` |[Register Model (sparse)]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#register-a-pretrained-sparse-encoding-model) | Registers an OpenSearch-provided pretrained sparse encoding model that is hosted on your OpenSearch cluster. If the `user_inputs` field contains a `deploy` key that is set to `true`, also deploys the model. | +|`register_local_custom_model` |[Register Model (custom)]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#register-a-custom-model) | Registers a custom model that is hosted on your OpenSearch cluster. If the `user_inputs` field contains a `deploy` key that is set to `true`, also deploys the model. | +|`delete_model` |[Delete Model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/delete-model/) |Unregisters and deletes a model. | +|`deploy_model` |[Deploy Model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/deploy-model/) |Deploys a registered model into memory. | +|`undeploy_model` |[Undeploy Model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/undeploy-model/) |Undeploys a deployed model from memory. | +|`register_agent` |[Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/) |Registers an agent as part of the ML Commons Agent Framework. | +|`delete_agent` |[Delete Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/) |Deletes an agent. | +|`create_tool` |No API | A special-case non-API step encapsulating the specification of a tool for an agent in the ML Commons Agent Framework. These will be listed as `previous_node_inputs` for the appropriate register agent step, with the value set to `tools`. 
| + +## Additional fields + +You can include the following additional fields in the `user_inputs` field when indicated. + +|Field |Data type |Description | +|--- |--- |--- | +|`node_timeout` |Time units |A user-provided timeout for this step. For example, `20s` for a 20-second timeout. | +|`deploy` |Boolean |Applicable to the Register Model step type. If set to `true`, also executes the Deploy Model step. | +|`tools_order` |List |Applicable only to the Register Agent step type. Specifies the ordering of `tools`. For example, specify `["foo_tool", "bar_tool"]` to sequence those tools in that order. | + +You can include the following additional fields in the `previous_node_inputs` field when indicated. + +|Field |Data type |Description | +|--- |--- |--- | +|`model_id` |String |The `model_id` is used as an input for several steps. As a special case for the Register Agent step type, if an `llm.model_id` field is not present in the `user_inputs` and not present in `previous_node_inputs`, the `model_id` field from the previous node may be used as a backup for the model ID. | + +## Example workflow steps + +For example workflow step implementations, see the [Workflow tutorial]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/). \ No newline at end of file diff --git a/_automating-configurations/workflow-tutorial.md b/_automating-configurations/workflow-tutorial.md new file mode 100644 index 00000000..99d84501 --- /dev/null +++ b/_automating-configurations/workflow-tutorial.md @@ -0,0 +1,623 @@ +--- +layout: default +title: Workflow tutorial +nav_order: 20 +--- + +# Workflow tutorial + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/flow-framework/issues/475). +{: .warning} + +You can automate the setup of common use cases, such as conversational chat, using a Chain-of-Thought (CoT) agent. An _agent_ orchestrates and runs ML models and tools. A _tool_ performs a set of specific tasks. This page presents a complete example of setting up a CoT agent. For more information about agents and tools, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/) + +The setup requires the following sequence of API requests, with provisioned resources used in subsequent requests. The following list provides an overview of the steps required for this workflow. The step names correspond to the names in the template: + +1. **Deploy a model on the cluster** + * [`create_connector_1`](#create_connector_1): Create a connector to an externally hosted model. + * [`register_model_2`](#register_model_2): Register a model using the connector that you created. + * [`deploy_model_3`](#deploy_model_3): Deploy the model. +1. **Use the deployed model for inference** + * Set up several tools that perform specific tasks: + * [`cat_index_tool`](#cat_index_tool): Set up a tool to obtain index information. + * [`ml_model_tool`](#ml_model_tool): Set up a machine learning (ML) model tool. + * Set up one or more agents that use some combination of the tools: + * [`sub_agent`](#sub_agent): Create an agent that uses the `cat_index_tool`. + * Set up tools representing these agents: + * [`agent_tool`](#agent_tool): Wrap the `sub_agent` so that you can use it as a tool. + * [`root_agent`](#root_agent): Set up a root agent that may delegate the task to either a tool or another agent. 
+ +The following sections describe the steps in detail. For the complete workflow template, see [Complete YAML workflow template](#complete-yaml-workflow-template). + +## Workflow graph + +The workflow described in the previous section is organized into a [template](#complete-yaml-workflow-template). Note that you can order the steps in several ways. In the example template, the `ml_model_tool` step is specified right before the `root_agent` step, but you can specify it at any point after the `deploy_model_3` step and before the `root_agent` step. The following diagram shows the directed acyclic graph (DAG) that OpenSearch creates for all of the steps in the order specified in the template. + +![Example workflow steps graph]({{site.url}}{{site.baseurl}}/images/automatic-workflow-dag.png){:style="width: 100%; max-width: 600px;" class="img-centered"} + +## 1. Deploy a model on the cluster + +To deploy a model on the cluster, you need to create a connector to the model, register the model, and deploy the model. + + +### create_connector_1 + + +The first step in the workflow is to create a connector to an externally hosted model (in the following example, this step is called `create_connector_1`). The content of the `user_inputs` field exactly matches the ML Commons [Create Connector API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/create-connector/): + +```yaml +nodes: +- id: create_connector_1 + type: create_connector + user_inputs: + name: OpenAI Chat Connector + description: The connector to public OpenAI model service for GPT 3.5 + version: '1' + protocol: http + parameters: + endpoint: api.openai.com + model: gpt-3.5-turbo + credential: + openAI_key: '12345' + actions: + - action_type: predict + method: POST + url: https://${parameters.endpoint}/v1/chat/completions +``` + +When you create a connector, OpenSearch returns a `connector_id`, which you need in order to register the model. + + +### register_model_2 + + +When registering a model, the `previous_node_inputs` field tells OpenSearch to obtain the required `connector_id` from the output of the `create_connector_1` step. Other inputs required by the [Register Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/) are included in the `user_inputs` field: + +```yaml +- id: register_model_2 + type: register_remote_model + previous_node_inputs: + create_connector_1: connector_id + user_inputs: + name: openAI-gpt-3.5-turbo + function_name: remote + description: test model +``` + +The output of this step is a `model_id`. You must then deploy the registered model to the cluster. + + +### deploy_model_3 + + +The [Deploy Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/deploy-model/) requires the `model_id` from the previous step, as specified in the `previous_node_inputs` field: + +```yaml +- id: deploy_model_3 + type: deploy_model + # This step needs the model_id produced as an output of the previous step + previous_node_inputs: + register_model_2: model_id +``` + +When using the Deploy Model API directly, a task ID is returned, requiring use of the [Tasks API](https://opensearch.org/docs/latest/ml-commons-plugin/api/tasks-apis/get-task/) to determine when the deployment is complete. The automated workflow eliminates the manual status check and returns the final `model_id` directly. + +### Ordering steps + +To order these steps in a sequence, you must connect them by an edge in the graph. 
When a `previous_node_input` field is present in a step, OpenSearch automatically creates a node with `source` and `dest` fields for this step. The output of the `source` is required as input for the `dest`. For example, the `register_model_2` step requires the `connector_id` from the `create_connector_1` step. Similarly, the `deploy_model_3` step requires the `model_id` from the `register_model_2` step. Thus, OpenSearch creates the first two edges in the graph as follows in order to match the output with the required input and raise errors if the required input is missing: + +```yaml +edges: +- source: create_connector_1 + dest: register_model_2 +- source: register_model_2 + dest: deploy_model_3 +``` + +If you define `previous_node_inputs`, then defining edges is optional. +{: .note} + +## 2. Use the deployed model for inference + +A CoT agent can use the deployed model in a tool. This step doesn’t strictly correspond to an API but represents a component of the body required by the [Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/). This simplifies the register request and allows reuse of the same tool in multiple agents. For more information about agents and tools, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/). + + +### cat_index_tool + + +You can configure other tools to be used by the CoT agent. For example, you can configure a `cat_index_tool` as follows. This tool does not depend on any previous steps: + +```yaml +- id: cat_index_tool + type: create_tool + user_inputs: + name: CatIndexTool + type: CatIndexTool + parameters: + max_iteration: 5 +``` + + +### sub_agent + + +To use the `cat_index_tool` in the agent configuration, specify it as one of the tools in the `previous_node_inputs` field of the agent. You can add other tools to `previous_node_inputs` as necessary. The agent also needs a large language model (LLM) in order to reason with the tools. The LLM is defined by the `llm.model_id` field. This example assumes that the `model_id` from the `deploy_model_3` step will be used. However, if another model is already deployed, the `model_id` of that previously deployed model could be included in the `user_inputs` field instead: + +```yaml +- id: sub_agent + type: register_agent + previous_node_inputs: + # When llm.model_id is not present this can be used as a fallback value + deploy-model-3: model_id + cat_index_tool: tools + user_inputs: + name: Sub Agent + type: conversational + description: this is a test agent + parameters: + hello: world + llm.parameters: + max_iteration: '5' + stop_when_no_tool_found: 'true' + memory: + type: conversation_index + app_type: chatbot +``` + +OpenSearch will automatically create the following edges so that the agent can retrieve the fields from the previous node: + +```yaml +- source: cat_index_tool + dest: sub_agent +- source: deploy_model_3 + dest: sub_agent +``` + + +### agent_tool + + +You can use an agent as a tool for another agent. Registering an agent produces an `agent_id` in the output. 
The following step defines a tool that uses the `agent_id` from the previous step: + +```yaml +- id: agent_tool + type: create_tool + previous_node_inputs: + sub_agent: agent_id + user_inputs: + name: AgentTool + type: AgentTool + description: Agent Tool + parameters: + max_iteration: 5 +``` + +OpenSearch automatically creates an edge connection because this step specifies the `previous_node_input`: + +```yaml +- source: sub_agent + dest: agent_tool +``` + + +### ml_model_tool + + +A tool may reference an ML model. This example gets the required `model_id` from the model deployed in a previous step: + +```yaml +- id: ml_model_tool + type: create_tool + previous_node_inputs: + deploy-model-3: model_id + user_inputs: + name: MLModelTool + type: MLModelTool + alias: language_model_tool + description: A general tool to answer any question. + parameters: + prompt: Answer the question as best you can. + response_filter: choices[0].message.content +``` + +OpenSearch automatically creates an edge in order to use the `previous_node_input`: + +```yaml +- source: deploy-model-3 + dest: ml_model_tool +``` + + +### root_agent + + +A conversational chat application will communicate with a single root agent that includes the ML model tool and the agent tool in its `tools` field. It will also obtain the `llm.model_id` from the deployed model. Some agents require tools to be in a specific order, which can be enforced by including the `tools_order` field in the user inputs: + +```yaml +- id: root_agent + type: register_agent + previous_node_inputs: + deploy-model-3: model_id + ml_model_tool: tools + agent_tool: tools + user_inputs: + name: DEMO-Test_Agent_For_CoT + type: conversational + description: this is a test agent + parameters: + prompt: Answer the question as best you can. + llm.parameters: + max_iteration: '5' + stop_when_no_tool_found: 'true' + tools_order: ['agent_tool', 'ml_model_tool'] + memory: + type: conversation_index + app_type: chatbot +``` + +OpenSearch automatically creates edges for the `previous_node_input` sources: + +```yaml +- source: deploy-model-3 + dest: root_agent +- source: ml_model_tool + dest: root_agent +- source: agent_tool + dest: root_agent +``` + +For the complete DAG that OpenSearch creates for this workflow, see the [workflow graph](#workflow-graph). + +## Complete YAML workflow template + +The following is the final template including all of the `provision` workflow steps in YAML format: + +
+ + YAML template + + {: .text-delta} + +```yaml +# This template demonstrates provisioning the resources for a +# Chain-of-Thought chat bot +name: tool-register-agent +description: test case +use_case: REGISTER_AGENT +version: + template: 1.0.0 + compatibility: + - 2.12.0 + - 3.0.0 +workflows: + # This workflow defines the actions to be taken when the Provision Workflow API is used + provision: + nodes: + # The first three nodes create a connector to a remote model, registers and deploy that model + - id: create_connector_1 + type: create_connector + user_inputs: + name: OpenAI Chat Connector + description: The connector to public OpenAI model service for GPT 3.5 + version: '1' + protocol: http + parameters: + endpoint: api.openai.com + model: gpt-3.5-turbo + credential: + openAI_key: '12345' + actions: + - action_type: predict + method: POST + url: https://${parameters.endpoint}/v1/chat/completions + - id: register_model_2 + type: register_remote_model + previous_node_inputs: + create_connector_1: connector_id + user_inputs: + # deploy: true could be added here instead of the deploy step below + name: openAI-gpt-3.5-turbo + description: test model + - id: deploy_model_3 + type: deploy_model + previous_node_inputs: + register_model_2: model_id + # For example purposes, the model_id obtained as the output of the deploy_model_3 step will be used + # for several below steps. However, any other deployed model_id can be used for those steps. + # This is one example tool from the Agent Framework. + - id: cat_index_tool + type: create_tool + user_inputs: + name: CatIndexTool + type: CatIndexTool + parameters: + max_iteration: 5 + # This simple agent only has one tool, but could be configured with many tools + - id: sub_agent + type: register_agent + previous_node_inputs: + deploy-model-3: model_id + cat_index_tool: tools + user_inputs: + name: Sub Agent + type: conversational + parameters: + hello: world + llm.parameters: + max_iteration: '5' + stop_when_no_tool_found: 'true' + memory: + type: conversation_index + app_type: chatbot + # An agent can be used itself as a tool in a nested relationship + - id: agent_tool + type: create_tool + previous_node_inputs: + sub_agent: agent_id + user_inputs: + name: AgentTool + type: AgentTool + parameters: + max_iteration: 5 + # An ML Model can be used as a tool + - id: ml_model_tool + type: create_tool + previous_node_inputs: + deploy-model-3: model_id + user_inputs: + name: MLModelTool + type: MLModelTool + alias: language_model_tool + parameters: + prompt: Answer the question as best you can. + response_filter: choices[0].message.content + # This final agent will be the interface for the CoT chat user + # Using a flow agent type tools_order matters + - id: root_agent + type: register_agent + previous_node_inputs: + deploy-model-3: model_id + ml_model_tool: tools + agent_tool: tools + user_inputs: + name: DEMO-Test_Agent + type: flow + parameters: + prompt: Answer the question as best you can. 
+ llm.parameters: + max_iteration: '5' + stop_when_no_tool_found: 'true' + tools_order: ['agent_tool', 'ml_model_tool'] + memory: + type: conversation_index + app_type: chatbot + # These edges are all automatically created with previous_node_input + edges: + - source: create_connector_1 + dest: register_model_2 + - source: register_model_2 + dest: deploy_model_3 + - source: cat_index_tool + dest: sub_agent + - source: deploy_model_3 + dest: sub_agent + - source: sub_agent + dest: agent_tool + - source: deploy-model-3 + dest: ml_model_tool + - source: deploy-model-3 + dest: root_agent + - source: ml_model_tool + dest: root_agent + - source: agent_tool + dest: root_agent +``` +
+ +## Complete JSON workflow template + +The following is the same template in JSON format: + +
+ + JSON template + + {: .text-delta} + +```json +{ + "name": "tool-register-agent", + "description": "test case", + "use_case": "REGISTER_AGENT", + "version": { + "template": "1.0.0", + "compatibility": [ + "2.12.0", + "3.0.0" + ] + }, + "workflows": { + "provision": { + "nodes": [ + { + "id": "create_connector_1", + "type": "create_connector", + "user_inputs": { + "name": "OpenAI Chat Connector", + "description": "The connector to public OpenAI model service for GPT 3.5", + "version": "1", + "protocol": "http", + "parameters": { + "endpoint": "api.openai.com", + "model": "gpt-3.5-turbo" + }, + "credential": { + "openAI_key": "12345" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://${parameters.endpoint}/v1/chat/completions" + } + ] + } + }, + { + "id": "register_model_2", + "type": "register_remote_model", + "previous_node_inputs": { + "create_connector_1": "connector_id" + }, + "user_inputs": { + "name": "openAI-gpt-3.5-turbo", + "description": "test model" + } + }, + { + "id": "deploy_model_3", + "type": "deploy_model", + "previous_node_inputs": { + "register_model_2": "model_id" + } + }, + { + "id": "cat_index_tool", + "type": "create_tool", + "user_inputs": { + "name": "CatIndexTool", + "type": "CatIndexTool", + "parameters": { + "max_iteration": 5 + } + } + }, + { + "id": "sub_agent", + "type": "register_agent", + "previous_node_inputs": { + "deploy-model-3": "llm.model_id", + "cat_index_tool": "tools" + }, + "user_inputs": { + "name": "Sub Agent", + "type": "conversational", + "parameters": { + "hello": "world" + }, + "llm.parameters": { + "max_iteration": "5", + "stop_when_no_tool_found": "true" + }, + "memory": { + "type": "conversation_index" + }, + "app_type": "chatbot" + } + }, + { + "id": "agent_tool", + "type": "create_tool", + "previous_node_inputs": { + "sub_agent": "agent_id" + }, + "user_inputs": { + "name": "AgentTool", + "type": "AgentTool", + "parameters": { + "max_iteration": 5 + } + } + }, + { + "id": "ml_model_tool", + "type": "create_tool", + "previous_node_inputs": { + "deploy-model-3": "model_id" + }, + "user_inputs": { + "name": "MLModelTool", + "type": "MLModelTool", + "alias": "language_model_tool", + "parameters": { + "prompt": "Answer the question as best you can.", + "response_filter": "choices[0].message.content" + } + } + }, + { + "id": "root_agent", + "type": "register_agent", + "previous_node_inputs": { + "deploy-model-3": "llm.model_id", + "ml_model_tool": "tools", + "agent_tool": "tools" + }, + "user_inputs": { + "name": "DEMO-Test_Agent", + "type": "flow", + "parameters": { + "prompt": "Answer the question as best you can." + }, + "llm.parameters": { + "max_iteration": "5", + "stop_when_no_tool_found": "true" + }, + "tools_order": [ + "agent_tool", + "ml_model_tool" + ], + "memory": { + "type": "conversation_index" + }, + "app_type": "chatbot" + } + } + ], + "edges": [ + { + "source": "create_connector_1", + "dest": "register_model_2" + }, + { + "source": "register_model_2", + "dest": "deploy_model_3" + }, + { + "source": "cat_index_tool", + "dest": "sub_agent" + }, + { + "source": "deploy_model_3", + "dest": "sub_agent" + }, + { + "source": "sub_agent", + "dest": "agent_tool" + }, + { + "source": "deploy-model-3", + "dest": "ml_model_tool" + }, + { + "source": "deploy-model-3", + "dest": "root_agent" + }, + { + "source": "ml_model_tool", + "dest": "root_agent" + }, + { + "source": "agent_tool", + "dest": "root_agent" + } + ] + } + } +} +``` +
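+
+To try this template end to end, one possible approach (a sketch based on the Create Workflow and Provision Workflow APIs described earlier; verify the exact parameters on those pages) is to create and provision the workflow in a single request, passing the JSON template above as the request body:
+
+```json
+POST /_plugins/_flow_framework/workflow?provision=true
+```
+
+OpenSearch returns a `workflow_id` that you can pass to the Get Workflow Status API to monitor provisioning progress.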
+ +## Next steps + +To learn more about agents and tools, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/). \ No newline at end of file diff --git a/_benchmark/index.md b/_benchmark/index.md new file mode 100644 index 00000000..25b3738e --- /dev/null +++ b/_benchmark/index.md @@ -0,0 +1,37 @@ +--- +layout: default +title: OpenSearch Benchmark +nav_order: 1 +has_children: false +nav_exclude: true +has_toc: false +permalink: /benchmark/ +redirect_from: + - /benchmark/index/ +--- + +# OpenSearch Benchmark + +OpenSearch Benchmark is a macrobenchmark utility provided by the [OpenSearch Project](https://github.com/opensearch-project). You can use OpenSearch Benchmark to gather performance metrics from an OpenSearch cluster for a variety of purposes, including: + +- Tracking the overall performance of an OpenSearch cluster. +- Informing decisions about when to upgrade your cluster to a new version. +- Determining how changes to your workflow---such as modifying mappings or queries---might impact your cluster. + +OpenSearch Benchmark can be installed directly on a compatible host running Linux or macOS. You can also run OpenSearch Benchmark in a Docker container. See [Installing OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/installing-benchmark/) for more information. + +The following diagram visualizes how OpenSearch Benchmark works when run against a local host: + +![Benchmark workflow]({{site.url}}{{site.baseurl}}/images/benchmark/OSB-workflow.png). + +The OpenSearch Benchmark documentation is split into five sections: + +- [Quickstart]({{site.url}}{{site.baseurl}}/benchmark/quickstart/): Learn how to quickly run and install OpenSearch Benchmark. +- [User guide]({{site.url}}{{site.baseurl}}/benchmark/user-guide/index/): Dive deep into how OpenSearch Benchmark can help you track the performance of your cluster. +- [Tutorials]({{site.url}}{{site.baseurl}}/benchmark/tutorials/index/): Use step-by-step guides for more advanced benchmarking configurations and functionality. +- [Commands]({{site.url}}{{site.baseurl}}/benchmark/commands/index/): A detailed reference of commands and command options supported by OpenSearch. +- [Workloads]({{site.url}}{{site.baseurl}}/benchmark/workloads/index/): A detailed reference of options available for both default and custom workloads. + + + + diff --git a/_benchmark/quickstart.md b/_benchmark/quickstart.md new file mode 100644 index 00000000..0c23f749 --- /dev/null +++ b/_benchmark/quickstart.md @@ -0,0 +1,283 @@ +--- +layout: default +title: Quickstart +nav_order: 2 +--- + +# OpenSearch Benchmark quickstart + +This tutorial outlines how to quickly install OpenSearch Benchmark and run your first OpenSearch Benchmark workload. + +## Prerequisites + +To perform the Quickstart steps, you'll need to fulfill the following prerequisites: + +- A currently active OpenSearch cluster. For instructions on how to create an OpenSearch cluster, see [Creating a cluster]({{site.url}}{{site.baseurl}}//tuning-your-cluster/index/). +- Git 2.3 or greater. +- Python 3.8 or later + +## Set up an OpenSearch cluster + +If you don't already have an active OpenSearch cluster, you can launch a new OpenSearch cluster to use with OpenSerch Benchmark. + +- Using **Docker Compose**. For instructions on how to use Docker Compose, see [OpenSearch Quickstart]({{site.url}}{{site.baseurl}}/quickstart/). +- Using **Tar**. 
For instructions on how to install OpenSearch with Tar, see [Installing OpenSearch > Tarball]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/tar#step-1-download-and-unpack-opensearch). + +OpenSearch Benchmark has not been tested with the Window's distribution of OpenSearch. +{: .note} + +After installation, you can verify OpenSearch is running by going to `localhost:9200`. If you're running your cluster with the Security plugin enabled, OpenSearch will expect SSL connections with the username "admin" and password "admin". However, since the localhost address is not a unique public address, no certificate authority will issue an SSL certificate for it, so certificate checking will need to be disabled using the `-k` option. + +Use the following command to verify OpenSearch is running with SSL certificate checks disabled: + +```bash +curl -k -u admin: https://localhost:9200 # the "-k" option skips SSL certificate checks + +{ + "name" : "147ddae31bf8.opensearch.org", + "cluster_name" : "opensearch", + "cluster_uuid" : "n10q2RirTIuhEJCiKMkpzw", + "version" : { + "distribution" : "opensearch", + "number" : "2.10.0", + "build_type" : "tar", + "build_hash" : "eee49cb340edc6c4d489bcd9324dda571fc8dc03", + "build_date" : "2023-09-20T23:54:29.889267151Z", + "build_snapshot" : false, + "lucene_version" : "9.7.0", + "minimum_wire_compatibility_version" : "7.10.0", + "minimum_index_compatibility_version" : "7.0.0" + }, + "tagline" : "The OpenSearch Project: https://opensearch.org/" +} +``` + +With your cluster running, you can now install OpenSearch Benchmark. + +## Installing OpenSearch Benchmark + +To install OpenSearch Benchmark with Docker, see [Installing OpenSearch Benchmark > Installing with Docker]({{site.url}}{{site.baseurl}}/benchmark/user-guide/installing-benchmark/#installing-with-docker). +{: .tip} + +To install OpenSearch Benchmark from PyPi, enter the following `pip` command: + +```bash +pip3 install opensearch-benchmark +``` +{% include copy.html %} + +After the installation completes, verify that OpenSearch Benchmark is running by entering the following command: + +```bash +opensearch-benchmark --help +``` + +If successful, OpenSearch returns the following response: + +```bash +$ opensearch-benchmark --help +usage: opensearch-benchmark [-h] [--version] {execute-test,list,info,create-workload,generate,compare,download,install,start,stop} ... 
+ + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + + A benchmarking tool for OpenSearch + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit + +subcommands: + {execute-test,list,info,create-workload,generate,compare,download,install,start,stop} + execute-test Run a benchmark + list List configuration options + info Show info about a workload + create-workload Create a Benchmark workload from existing data + generate Generate artifacts + compare Compare two test_executions + download Downloads an artifact + install Installs an OpenSearch node locally + start Starts an OpenSearch node locally + stop Stops an OpenSearch node locally + +Find out more about Benchmark at https://opensearch.org/docs +``` + +## Running your first benchmark + +You can now run your first benchmark. The following benchmark uses the [percolator](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/percolator) workload. + + +### Understanding workload command flags + +Benchmarks are run using the [`execute-test`]({{site.url}}{{site.baseurl}}/benchmark/commands/execute-test/) command with the following command flags: + +For additional `execute_test` command flags, see the [execute-test]({{site.url}}{{site.baseurl}}/benchmark/commands/execute-test/) reference. Some commonly used options are `--workload-params`, `--exclude-tasks`, and `--include-tasks`. +{: .tip} + +* `--pipeline=benchmark-only` : Informs OSB that users wants to provide their own OpenSearch cluster. +- `workload=percolator`: The name of workload used by OpenSearch Benchmark. +* `--target-host=""`: Indicates the target cluster or host that will be benchmarked. Enter the endpoint of your OpenSearch cluster here. +* `--client-options="basic_auth_user:'',basic_auth_password:''"`: The username and password for your OpenSearch cluster. +* `--test-mode`: Allows a user to run the workload without running it for the entire duration. When this flag is present, Benchmark runs the first thousand operations of each task in the workload. This is only meant for sanity checks---the metrics produced are meaningless. + +The `--distribution-version`, which indicates which OpenSearch version Benchmark will use when provisioning. When run, the `execute-test` command will parse the correct distribution version when it connects to the OpenSearch cluster. + +### Running the workload + +To run the [percolator](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/percolator) workload with OpenSearch Benchmark, use the following `execute-test` command: + +```bash +opensearch-benchmark execute-test --pipeline=benchmark-only --workload=percolator --target-host=https://localhost:9200 --client-options=basic_auth_user:admin,basic_auth_password:admin,verify_certs:false --test-mode +``` +{% include copy.html %} + +When the `execute_test` command runs, all tasks and operations in the `percolator` workload run sequentially. 
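+
+If you want to exercise only part of the workload, you can combine the same command with the task filters mentioned in the preceding tip. The following sketch (the `index` task name is taken from the percolator summary output shown later in this guide) runs only the indexing task:
+
+```bash
+opensearch-benchmark execute-test --pipeline=benchmark-only --workload=percolator --target-host=https://localhost:9200 --client-options=basic_auth_user:admin,basic_auth_password:admin,verify_certs:false --include-tasks="index" --test-mode
+```
+{% include copy.html %}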
+ +### Validating the test + +After an OpenSearch Benchmark test runs, take the following steps to verify that it has run properly: + +- Note the number of documents in the OpenSearch or OpenSearch Dashboards index that you plan to run the benchmark against. +- In the results returned by OpenSearch Benchmark, compare the `workload.json` file for your specific workload and verify that the document count matches the number of documents. For example, based on the [percolator](https://github.com/opensearch-project/opensearch-benchmark-workloads/blob/main/percolator/workload.json#L19) `workload.json` file, you should expect to see `2000000` documents in your cluster. + +### Understanding the results + +OpenSearch Benchmark returns the following response once the benchmark completes: + +```bash +------------------------------------------------------ + _______ __ _____ + / ____(_)___ ____ _/ / / ___/_________ ________ + / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \ + / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/ +/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/ +------------------------------------------------------ + +| Metric | Task | Value | Unit | +|---------------------------------------------------------------:|-------------------------------------------:|------------:|-------:| +| Cumulative indexing time of primary shards | | 0.02655 | min | +| Min cumulative indexing time across primary shards | | 0 | min | +| Median cumulative indexing time across primary shards | | 0.00176667 | min | +| Max cumulative indexing time across primary shards | | 0.0140333 | min | +| Cumulative indexing throttle time of primary shards | | 0 | min | +| Min cumulative indexing throttle time across primary shards | | 0 | min | +| Median cumulative indexing throttle time across primary shards | | 0 | min | +| Max cumulative indexing throttle time across primary shards | | 0 | min | +| Cumulative merge time of primary shards | | 0.0102333 | min | +| Cumulative merge count of primary shards | | 3 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0 | min | +| Max cumulative merge time across primary shards | | 0.0102333 | min | +| Cumulative merge throttle time of primary shards | | 0 | min | +| Min cumulative merge throttle time across primary shards | | 0 | min | +| Median cumulative merge throttle time across primary shards | | 0 | min | +| Max cumulative merge throttle time across primary shards | | 0 | min | +| Cumulative refresh time of primary shards | | 0.0709333 | min | +| Cumulative refresh count of primary shards | | 118 | | +| Min cumulative refresh time across primary shards | | 0 | min | +| Median cumulative refresh time across primary shards | | 0.00186667 | min | +| Max cumulative refresh time across primary shards | | 0.0511667 | min | +| Cumulative flush time of primary shards | | 0.00963333 | min | +| Cumulative flush count of primary shards | | 4 | | +| Min cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 0 | min | +| Max cumulative flush time across primary shards | | 0.00398333 | min | +| Total Young Gen GC time | | 0 | s | +| Total Young Gen GC count | | 0 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 0.000485923 | GB | +| Translog size | | 2.01873e-05 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms 
| | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 32 | | +| Min Throughput | index | 3008.97 | docs/s | +| Mean Throughput | index | 3008.97 | docs/s | +| Median Throughput | index | 3008.97 | docs/s | +| Max Throughput | index | 3008.97 | docs/s | +| 50th percentile latency | index | 351.059 | ms | +| 100th percentile latency | index | 365.058 | ms | +| 50th percentile service time | index | 351.059 | ms | +| 100th percentile service time | index | 365.058 | ms | +| error rate | index | 0 | % | +| Min Throughput | wait-until-merges-finish | 28.41 | ops/s | +| Mean Throughput | wait-until-merges-finish | 28.41 | ops/s | +| Median Throughput | wait-until-merges-finish | 28.41 | ops/s | +| Max Throughput | wait-until-merges-finish | 28.41 | ops/s | +| 100th percentile latency | wait-until-merges-finish | 34.7088 | ms | +| 100th percentile service time | wait-until-merges-finish | 34.7088 | ms | +| error rate | wait-until-merges-finish | 0 | % | +| Min Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| Mean Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| Median Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| Max Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| 100th percentile latency | percolator_with_content_president_bush | 35.9822 | ms | +| 100th percentile service time | percolator_with_content_president_bush | 7.93048 | ms | +| error rate | percolator_with_content_president_bush | 0 | % | + +[...] + +| Min Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| Mean Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| Median Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| Max Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| 100th percentile latency | percolator_with_content_ignore_me | 131.798 | ms | +| 100th percentile service time | percolator_with_content_ignore_me | 69.5237 | ms | +| error rate | percolator_with_content_ignore_me | 0 | % | +| Min Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| Mean Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| Median Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| Max Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| 100th percentile latency | percolator_no_score_with_content_ignore_me | 45.5703 | ms | +| 100th percentile service time | percolator_no_score_with_content_ignore_me | 11.316 | ms | +| error rate | percolator_no_score_with_content_ignore_me | 0 | % | + + + +-------------------------------- +[INFO] SUCCESS (took 18 seconds) +-------------------------------- +``` + +Each task run by the `percolator` workload represents a specific OpenSearch API operation---such as Bulk or Search---that was performed when the test was run. Each task in the output summary contains the following information: + +* **Throughput:** The number of successful OpenSearch operations per second. +* **Latency:** The amount of time, including wait time, taken for the request and the response to be sent and received by Benchmark. +* **Service Time:** The amount of time, excluding wait time, taken for the request and the response to be sent and received by Benchmark. +* **Error Rate:** The percentage of operations run during the task that were not successful or returned a 200 error code. 
+
+For more details about how the summary report is generated, see [Summary report]({{site.url}}{{site.baseurl}}/benchmark/reference/summary-report/).
+
+
+## Running OpenSearch Benchmark on your own cluster
+
+Now that you're familiar with running OpenSearch Benchmark against a cluster, you can run it against your own cluster by using the same `execute-test` command and replacing the following settings.
+
+ * Replace `https://localhost:9200` with your target cluster endpoint. This could be a URI like `https://search.mydomain.com` or a `HOST:PORT` specification.
+ * If the cluster is configured with basic authentication, replace the username and password in the command line with the appropriate credentials.
+ * Remove the `verify_certs:false` directive if you are not specifying `localhost` as your target cluster. This directive is needed only for clusters where SSL certificates are not set up.
+ * If you are using a `HOST:PORT` specification and plan to use SSL/TLS, either specify `https://` or add the `use_ssl:true` directive to the `--client-options` string option.
+ * Remove the `--test-mode` flag to run the full workload rather than an abbreviated test.
+
+You can copy the following command template to use in your own terminal, replacing `<target-cluster-endpoint>` with your cluster endpoint:
+
+```bash
+opensearch-benchmark execute-test --pipeline=benchmark-only --workload=percolator --target-host=<target-cluster-endpoint> --client-options=basic_auth_user:admin,basic_auth_password:admin
+```
+{% include copy.html %}
+
+## Next steps
+
+See the following resources to learn more about OpenSearch Benchmark:
+
+- [User guide]({{site.url}}{{site.baseurl}}/benchmark/user-guide/index/): Dive deep into how OpenSearch Benchmark can help you track the performance of your cluster.
+- [Tutorials]({{site.url}}{{site.baseurl}}/benchmark/tutorials/index/): Use step-by-step guides for more advanced benchmarking configurations and functionality.
diff --git a/_benchmark/reference/commands/command-flags.md b/_benchmark/reference/commands/command-flags.md
new file mode 100644
index 00000000..ca0606f0
--- /dev/null
+++ b/_benchmark/reference/commands/command-flags.md
@@ -0,0 +1,329 @@
+---
+layout: default
+title: Command flags
+nav_order: 51
+parent: Command reference
+redirect_from: /benchmark/commands/command-flags/
+grand_parent: OpenSearch Benchmark Reference
+---
+
+# Command flags
+OpenSearch Benchmark uses command line flags to change Benchmark's behavior. Not all flags can be used with each command. To find out which flags are supported by a specific command, enter `opensearch-benchmark <command> --help`.
+
+All command flags are added to a command using the following syntax:
+
+```bash
+opensearch-benchmark <command> --<command-flag>
+```
+
+Flags that accept comma-separated values, such as `--telemetry`, can also accept a JSON array. This can be defined by passing a file path ending in `.json` or inline as a JSON string.
+
+- Comma-separated values: `opensearch-benchmark ... --test-procedure="ingest-only,search-aggregations"`
+- JSON file: `opensearch-benchmark ... --workload-params="params.json"`
+- JSON inline string: `opensearch-benchmark ... --telemetry='["node-stats", "recovery-stats"]'`
+
+
+## workload-path
+
+
+This can be either a directory that contains a `workload.json` file or a `.json` file with an arbitrary name that contains a workload specification. `--workload-path` and `--workload-repository` as well as `--workload` are mutually exclusive.
+
+
+## workload-repository
+
+
+This defines the repository from which OpenSearch Benchmark loads workloads.
`--workload-path` and `--workload-repository` as well as `--workload` are mutually exclusive. + + +## workload-revision + + +Defines a specific revision from the workload source tree that OpenSearch Benchmark should use. + + +## workload + + +Defines the workload to use based on the workload's name. You can find a list of preloaded workloads using `opensearch-benchmark list workloads`. `--workload-path` and `--workload-repository` as well as `--workload` are mutually exclusive. + + +## workload-params + + +Defines which variables to inject into the workload. Variables injected must be available in the workload. To see which parameters are valid in the official workloads, select the workload from [the workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads). + + +## test-procedure + + +Defines the test procedures to use with each workload. You can find a list of test procedures that the workload supports by specifying the workload in the `info` command, for example, `opensearch-benchmark info --workload=`. To look up information on a specific test procedure, use the command `opensearch-benchmark info --workload= --test-procedure=`. + + +## test-execution-id + + +Defines a unique ID for the test run. + + +## include-tasks + + +Defines a comma-separated list of test procedure tasks to run. By default, all tasks listed in a test procedure array are run. + +Tests are executed in the order they are defined in `test-procedure`---not in the order they are defined in the command. + +All task filters are case sensitive. + + +## exclude-tasks + + +Defines a comma-separated list of test procedure tasks not to run. + + +## baseline + + +The baseline TestExecution ID used to compare the contender TestExecution. + + +## contender + + +The TestExecution ID for the contender being compared to the baseline. + + +## results-format + + +Defines the output format for the command line results, either `markdown` or `csv`. Default is `markdown`. + + + +## results-number-align + + +Defines the column number alignment for when the `compare` command outputs results. Default is `right`. + + +## results-file + + +When provided a file path, writes the compare results to the file indicated in the path. + + +## show-in-results + + +Determines whether or not to include the comparison in the results file. + + +## provision-config-repository + + +Defines the repository from which OpenSearch Benchmark loads `provision-configs` and `provision-config-instances`. + + +## provision-config-revision + + +Defines the specific Git revision in the `provision-config` that OpenSearch Benchmark should use. + + +## provision-config-path + + +Defines the path to the `--provision-config-instance` and any OpenSearch plugin configurations to use. + + +## distribution-version + + +Downloads the specified OpenSearch distribution based on version number. For a list of released OpenSearch versions, see [Version history](https://opensearch.org/docs/version-history/). + + +## distribution-repository + + +Defines the repository from which the OpenSearch distribution should be downloaded. Default is `release`. + + +## provision-config-instance + + +Defines the `--provision-config-instance` to use. You can view possible configuration instances by using the command `opensearch-benchmark list provision-config-instances`. + + +## provision-config-instance-params + + +A comma-separated list of key-value pairs injected verbatim as variables for the `provision-config-instance`. 
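+
+As an illustration, the following sketch combines several of the flags described above in a single `execute-test` invocation. The parameter values and test execution ID are placeholders; check your workload's documentation for the parameters that it actually supports:
+
+```bash
+opensearch-benchmark execute-test --workload=geonames --workload-params="number_of_replicas:0,bulk_size:5000" --test-procedure=append-no-conflicts --test-execution-id=baseline-run-001 --results-format=csv --results-file=baseline-run-001.csv
+```
+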
+ + +## target-hosts + + +Defines a comma-separated list of host-port pairs that should be targeted if using the pipeline `benchmark-only`. Default is `localhost:9200`. + + +## target-os + + +The target operating system (OS) for which the OpenSearch artifact should be downloaded. Default is the current OS. + + +## target-arch + + +The name of the CPU architecture for which an artifact should be downloaded. + + +## revision + + +Defines the current source code revision to use for running a benchmark test. Default is `current`. + +This command flag can use the following options: + + - `current`: Uses the source tree's current revision based on your OpenSearch distribution. + - `latest`: Fetches the latest revision from the main branch of the source tree. + +You can also use a timestamp or commit ID from the source tree. When using a timestamp, specify `@ts`, where "ts" is a valid ISO 8601 timestamp, for example, `@2013-07-27T10:37:00Z`. + + +## opensearch-plugins + + +Defines which [OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/) to install. By default, no plugins are installed. + + +## plugin-params + + +Defines a comma-separated list of key-value pairs that are injected verbatim into all plugins as variables. + + +## runtime-jdk + + +The major version of JDK to use. + + + +## client-options + + +Defines a comma-separated list of clients to use. All options are passed to the OpenSearch Python client. Default is `timeout:60`. + + +## load-worker-coordinator-hosts + + +Defines a comma-separated list of hosts that coordinate loads. Default is `localhost`. + + +## enable-worker-coordinator-profiling + + +Enables a performance analysis of OpenSearch Benchmark's worker coordinator. Default is `false`. + + +## pipeline + + +The `--pipeline` option selects a pipeline to run. You can find a list of pipelines supported by OpenSearch Benchmark by running `opensearch-benchmark list pipelines`. + + +## telemetry + + +Enables the provided telemetry devices when the devices are provided using a comma-separated list. You can find a list of possible telemetry devices by using `opensearch-benchmark list telemetry`. + + +## telemetry-params + + +Enables setting parameters for telemetry devices. Accepts a list of comma-separated key-value pairs, each of which are delimited by a colon or a JSON file name. + + +## on-error + + +Controls how OpenSearch Benchmark responds to errors. Default is `continue`. + +You can use the following options with this command flag: + +- `continue`: Continues to run the test despite the error. +- `abort`: Aborts the test when an error occurs. + + +## preserve-install + + +Keeps the Benchmark candidate and its index. Default is `false`. + + +## kill-running-processes + + +When set to `true`, stops any OpenSearch Benchmark processes currently running and allows Benchmark to continue to run. Default is `false`. + + +## chart-spec-path + + +Sets the path to the JSON files containing chart specifications that can be used to generate charts. + + +## chart-type + + +Generates the indicated chart type, either `time-series` or `bar`. Default is `time-series`. + + +## output-path + + +The name and path used for the chart's output. Default is `stdout`. + + +## limit + + +Limits the number of search results for recent test runs. Default is `10`. + + +## latency-percentiles + + +Specifies a comma-separated list of latency percentiles to report after the workload runs. Accepts `ints` or `floats` with values between `0` and `100` inclusive. 
Does not accept `min`, `median`, `mean`, or `max`. Default is `50,90,99,99.9,99.99,100`.
+
+
+## throughput-percentiles
+
+
+Specifies a list of throughput percentiles to report after the workload runs, in addition to the min/median/mean/max values, which are always displayed. Like `--latency-percentiles`, this setting accepts `ints` or `floats` with values between `0` and `100` inclusive. Does not accept `min`, `median`, `mean`, or `max`. Default is `None`.
+
+
+## randomization-enabled
+
+
+Enables randomization of values in range queries, where the values are drawn from standard value functions registered with `register_standard_value_source` in the workload's `workload.py` file.
+
+A standard value function is a no-argument function that generates a random pair of values for a certain field, in a dict with keys `"gte"`, `"lte"`, and optionally `"format"`.
+
+If this argument is `True` but a search operation does not have a registered standard value function, OpenSearch Benchmark raises a `SystemSetupError`.
+
+Default is `False`.
+
+
+## randomization-repeat-frequency
+
+
+Sets the fraction of randomized query values that can be repeated. Takes values between `0.0` and `1.0`. Default is `0.3`. This setting takes effect only when `--randomization-enabled` is used.
+
+
+## randomization-n
+
+
+Sets how many distinct repeatable pair values are generated for each operation when randomization is used. Default is `5000`. This setting takes effect only when `--randomization-enabled` is used.
diff --git a/_benchmark/reference/commands/compare.md b/_benchmark/reference/commands/compare.md
new file mode 100644
index 00000000..dc7ed2f5
--- /dev/null
+++ b/_benchmark/reference/commands/compare.md
@@ -0,0 +1,136 @@
+---
+layout: default
+title: compare
+nav_order: 55
+parent: Command reference
+grand_parent: OpenSearch Benchmark Reference
+redirect_from: /benchmark/commands/compare/
+---
+
+
+# compare
+
+
+The `compare` command helps you analyze the difference between two benchmark tests. This can help you analyze the performance impact of changes made relative to a previous test based on a specific Git revision.
+
+## Usage
+
+You can compare two different workload tests using their TestExecution IDs. To find a list of tests run from a specific workload, use `opensearch-benchmark list test_executions`.
You should receive an output similar to the following: + + +``` + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ +Recent test-executions: + +Recent test_executions: + +TestExecution ID TestExecution Timestamp Workload Workload Parameters TestProcedure ProvisionConfigInstance User Tags workload Revision Provision Config Revision +------------------------------------ ------------------------- ---------- --------------------- ------------------- ------------------------- ----------- ------------------- --------------------------- +729291a0-ee87-44e5-9b75-cc6d50c89702 20230524T181718Z geonames append-no-conflicts 4gheap 30260cf +f91c33d0-ec93-48e1-975e-37476a5c9fe5 20230524T170134Z geonames append-no-conflicts 4gheap 30260cf +d942b7f9-6506-451d-9dcf-ef502ab3e574 20230524T144827Z geonames append-no-conflicts 4gheap 30260cf +a33845cc-c2e5-4488-a2db-b0670741ff9b 20230523T213145Z geonames append-no-conflicts + +``` + +Then use `compare` to call a `--baseline` test and a `--contender` test for comparison. + +``` +opensearch-benchmark compare --baseline=417ed42-6671-9i79-11a1-e367636068ce --contender=beb154e4-0a05-4f45-ad9f-e34f9a9e51f7 +``` + +You should receive the following response comparing the final benchmark metrics for both tests: + +``` + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + +Comparing baseline + TestExecution ID: 729291a0-ee87-44e5-9b75-cc6d50c89702 + TestExecution timestamp: 2023-05-24 18:17:18 + +with contender + TestExecution ID: a33845cc-c2e5-4488-a2db-b0670741ff9b + TestExecution timestamp: 2023-05-23 21:31:45 + + +------------------------------------------------------ + _______ __ _____ + / ____(_)___ ____ _/ / / ___/_________ ________ + / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \ + / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/ +/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/ +------------------------------------------------------ + Metric Baseline Contender Diff +-------------------------------------------------------- ---------- ----------- ----------------- + Min Indexing Throughput [docs/s] 19501 19118 -383.00000 + Median Indexing Throughput [docs/s] 20232 19927.5 -304.45833 + Max Indexing Throughput [docs/s] 21172 20849 -323.00000 + Total indexing time [min] 55.7989 56.335 +0.53603 + Total merge time [min] 12.9766 13.3115 +0.33495 + Total refresh time [min] 5.20067 5.20097 +0.00030 + Total flush time [min] 0.0648667 0.0681833 +0.00332 + Total merge throttle time [min] 0.796417 0.879267 +0.08285 + Query latency term (50.0 percentile) [ms] 2.10049 2.15421 +0.05372 + Query latency term (90.0 percentile) [ms] 2.77537 2.84168 +0.06630 + Query latency term (100.0 percentile) [ms] 4.52081 5.15368 +0.63287 + Query latency country_agg (50.0 
percentile) [ms] 112.049 110.385 -1.66392 + Query latency country_agg (90.0 percentile) [ms] 128.426 124.005 -4.42138 + Query latency country_agg (100.0 percentile) [ms] 155.989 133.797 -22.19185 + Query latency scroll (50.0 percentile) [ms] 16.1226 14.4974 -1.62519 + Query latency scroll (90.0 percentile) [ms] 17.2383 15.4079 -1.83043 + Query latency scroll (100.0 percentile) [ms] 18.8419 18.4241 -0.41784 + Query latency country_agg_cached (50.0 percentile) [ms] 1.70223 1.64502 -0.05721 + Query latency country_agg_cached (90.0 percentile) [ms] 2.34819 2.04318 -0.30500 +Query latency country_agg_cached (100.0 percentile) [ms] 3.42547 2.86814 -0.55732 + Query latency default (50.0 percentile) [ms] 5.89058 5.83409 -0.05648 + Query latency default (90.0 percentile) [ms] 6.71282 6.64662 -0.06620 + Query latency default (100.0 percentile) [ms] 7.65307 7.3701 -0.28297 + Query latency phrase (50.0 percentile) [ms] 1.82687 1.83193 +0.00506 + Query latency phrase (90.0 percentile) [ms] 2.63714 2.46286 -0.17428 + Query latency phrase (100.0 percentile) [ms] 5.39892 4.22367 -1.17525 + Median CPU usage (index) [%] 668.025 679.15 +11.12499 + Median CPU usage (stats) [%] 143.75 162.4 +18.64999 + Median CPU usage (search) [%] 223.1 229.2 +6.10000 + Total Young Gen GC time [s] 39.447 40.456 +1.00900 + Total Young Gen GC count 10 11 +1.00000 + Total Old Gen GC time [s] 7.108 7.703 +0.59500 + Total Old Gen GC count 10 11 +1.00000 + Index size [GB] 3.25475 3.25098 -0.00377 + Total written [GB] 17.8434 18.3143 +0.47083 + Heap used for segments [MB] 21.7504 21.5901 -0.16037 + Heap used for doc values [MB] 0.16436 0.13905 -0.02531 + Heap used for terms [MB] 20.0293 19.9159 -0.11345 + Heap used for norms [MB] 0.105469 0.0935669 -0.01190 + Heap used for points [MB] 0.773487 0.772155 -0.00133 + Heap used for points [MB] 0.677795 0.669426 -0.00837 + Segment count 136 121 -15.00000 + Indices Stats(90.0 percentile) [ms] 3.16053 3.21023 +0.04969 + Indices Stats(99.0 percentile) [ms] 5.29526 3.94132 -1.35393 + Indices Stats(100.0 percentile) [ms] 5.64971 7.02374 +1.37403 + Nodes Stats(90.0 percentile) [ms] 3.19611 3.15251 -0.04360 + Nodes Stats(99.0 percentile) [ms] 4.44111 4.87003 +0.42892 + Nodes Stats(100.0 percentile) [ms] 5.22527 5.66977 +0.44450 +``` + +## Options + +You can use the following options to customize the results of your test comparison: + +- `--baseline`: The baseline TestExecution ID used to compare the contender TestExecution. +- `--contender`: The TestExecution ID for the contender being compared to the baseline. +- `--results-format`: Defines the output format for the command line results, either `markdown` or `csv`. Default is `markdown`. +- `--results-number-align`: Defines the column number alignment for when the `compare` command outputs results. Default is `right`. +- `--results-file`: When provided a file path, writes the compare results to the file indicated in the path. +- `--show-in-results`: Determines whether or not to include the comparison in the results file. + + diff --git a/_benchmark/reference/commands/download.md b/_benchmark/reference/commands/download.md new file mode 100644 index 00000000..580e7ef8 --- /dev/null +++ b/_benchmark/reference/commands/download.md @@ -0,0 +1,44 @@ +--- +layout: default +title: download +nav_order: 60 +parent: Command reference +grand_parent: OpenSearch Benchmark Reference +redirect_from: /benchmark/commands/download/ +--- + + +# download + + +Use the `download` command to select which OpenSearch distribution version to download. 
+ +## Usage + +The following example downloads OpenSearch version 2.7.0: + +``` +opensearch-benchmark download --distribution-version=2.7.0 +``` + +Benchmark then returns the location of the OpenSearch artifact: + +``` +{ + "opensearch": "/Users/.benchmark/benchmarks/distributions/opensearch-2.7.0.tar.gz" +} +``` + +## Options + +Use the following options to customize how OpenSearch Benchmark downloads OpenSearch: + +- `--provision-config-repository`: Defines the repository from which OpenSearch Benchmark loads `provision-configs` and `provision-config-instances`. +- `--provision-config-revision`: Defines a specific Git revision in the `provision-config` that OpenSearch Benchmark should use. +- `--provision-config-path`: Defines the path to the `--provision-config-instance` and any OpenSearch plugin configurations to use. +- `--distribution-version`: Downloads the specified OpenSearch distribution based on version number. For a list of released OpenSearch versions, see [Version history](https://opensearch.org/docs/version-history/). +- `--distribution-repository`: Defines the repository from where the OpenSearch distribution should be downloaded. Default is `release`. +- `--provision-config-instance`: Defines the `--provision-config-instance` to use. You can view possible configuration instances using the command `opensearch-benchmark list provision-config-instances`. +- `--provision-config-instance-params`: A comma-separated list of key-value pairs injected verbatim as variables for the `provision-config-instance`. +- `--target-os`: The target operating system (OS) for which the OpenSearch artifact should be downloaded. Default is the current OS. +- `--target-arch`: The name of the CPU architecture for which an artifact should be downloaded. diff --git a/_benchmark/reference/commands/execute-test.md b/_benchmark/reference/commands/execute-test.md new file mode 100644 index 00000000..d8a3e14d --- /dev/null +++ b/_benchmark/reference/commands/execute-test.md @@ -0,0 +1,182 @@ +--- +layout: default +title: execute-test +nav_order: 65 +parent: Command reference +grand_parent: OpenSearch Benchmark Reference +redirect_from: /benchmark/commands/execute-test/ +--- + + +# execute-test + + +Whether you're using the included [OpenSearch Benchmark workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) or a [custom workload]({{site.url}}{{site.baseurl}}/benchmark/creating-custom-workloads/), use the `execute-test` command to gather data about the performance of your OpenSearch cluster according to the selected workload. 
+ +## Usage + +The following example executes a test using the `geonames` workload in test mode: + +``` +opensearch-benchmark execute-test --workload=geonames --test-mode +``` + +After the test runs, OpenSearch Benchmark responds with a summary of the benchmark metrics: + +``` +------------------------------------------------------ + _______ __ _____ + / ____(_)___ ____ _/ / / ___/_________ ________ + / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \ + / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/ +/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/ +------------------------------------------------------ + +| Metric | Task | Value | Unit | +|-------------------------------:|---------------------:|----------:|-------:| +| Total indexing time | | 28.0997 | min | +| Total merge time | | 6.84378 | min | +| Total refresh time | | 3.06045 | min | +| Total flush time | | 0.106517 | min | +| Total merge throttle time | | 1.28193 | min | +| Median CPU usage | | 471.6 | % | +| Total Young Gen GC | | 16.237 | s | +| Total Old Gen GC | | 1.796 | s | +| Index size | | 2.60124 | GB | +| Total written | | 11.8144 | GB | +| Heap used for segments | | 14.7326 | MB | +| Heap used for doc values | | 0.115917 | MB | +| Heap used for terms | | 13.3203 | MB | +| Heap used for norms | | 0.0734253 | MB | +| Heap used for points | | 0.5793 | MB | +| Heap used for stored fields | | 0.643608 | MB | +| Segment count | | 97 | | +| Min Throughput | index-append | 31925.2 | docs/s | +| Median Throughput | index-append | 39137.5 | docs/s | +| Max Throughput | index-append | 39633.6 | docs/s | +| 50.0th percentile latency | index-append | 872.513 | ms | +| 90.0th percentile latency | index-append | 1457.13 | ms | +| 99.0th percentile latency | index-append | 1874.89 | ms | +| 100th percentile latency | index-append | 2711.71 | ms | +| 50.0th percentile service time | index-append | 872.513 | ms | +| 90.0th percentile service time | index-append | 1457.13 | ms | +| 99.0th percentile service time | index-append | 1874.89 | ms | +| 100th percentile service time | index-append | 2711.71 | ms | +| ... | ... | ... | ... | +| ... | ... | ... | ... | +| Min Throughput | painless_dynamic | 2.53292 | ops/s | +| Median Throughput | painless_dynamic | 2.53813 | ops/s | +| Max Throughput | painless_dynamic | 2.54401 | ops/s | +| 50.0th percentile latency | painless_dynamic | 172208 | ms | +| 90.0th percentile latency | painless_dynamic | 310401 | ms | +| 99.0th percentile latency | painless_dynamic | 341341 | ms | +| 99.9th percentile latency | painless_dynamic | 344404 | ms | +| 100th percentile latency | painless_dynamic | 344754 | ms | +| 50.0th percentile service time | painless_dynamic | 393.02 | ms | +| 90.0th percentile service time | painless_dynamic | 407.579 | ms | +| 99.0th percentile service time | painless_dynamic | 430.806 | ms | +| 99.9th percentile service time | painless_dynamic | 457.352 | ms | +| 100th percentile service time | painless_dynamic | 459.474 | ms | + +---------------------------------- +[INFO] SUCCESS (took 2634 seconds) +---------------------------------- +``` + +## Options + +Use the following options to customize the `execute-test` command for your use case. Options in this section are categorized by their use case. + +## General settings + +The following options shape how each test runs and how results appear: + +- `--test-mode`: Runs the given workload in test mode, which is useful when checking a workload for errors. 
+- `--user-tag`: Defines user-specific key-value pairs to be used in metric record as meta information, for example, `intention:baseline-ticket-12345`. +- `--results-format`: Defines the output format for the command line results, either `markdown` or `csv`. Default is `markdown`. +- `--results-number-align`: Defines the column number alignment for when the `compare` command outputs results. Default is `right`. +- `--results-file`: When provided a file path, writes the compare results to the file indicated in the path. +- `--show-in-results`: Determines whether or not to include the comparison in the results file. + + +### Distributions + +The following options set which version of OpenSearch and the OpenSearch plugins the benchmark test uses: + +- `--distribution-version`: Downloads the specified OpenSearch distribution based on version number. For a list of released OpenSearch versions, see [Version history](https://opensearch.org/docs/version-history/). +- `--distribution-repository`: Defines the repository from where the OpenSearch distribution should be downloaded. Default is `release`. +- `--revision`: Defines the current source code revision to use for running a benchmark test. Default is `current`. + - `current`: Uses the source tree's current revision based on your OpenSearch distribution. + - `latest`: Fetches the latest revision from the main branch of the source tree. + - You can also use a timestamp or commit ID from the source tree. When using a timestamp, specify `@ts`, where "ts" is a valid ISO 8601 timestamp, for example, `@2013-07-27T10:37:00Z`. +- `--opensearch-plugins`: Defines which [OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/) to install. By default, no plugins are installed. +- `--plugin-params:` Defines a comma-separated list of key:value pairs that are injected verbatim into all plugins as variables. +- `--runtime-jdk`: The major version of JDK to use. +- `--client-options`: Defines a comma-separated list of clients to use. All options are passed to the OpenSearch Python client. Default is `timeout:60`. + +### Cluster + +The following option relates to the target cluster of the benchmark. + +- `--target-hosts`: Defines a comma-separated list of host-port pairs that should be targeted if using the pipeline `benchmark-only`. Default is `localhost:9200`. + + +### Distributed workload generation + +The following options help those who want to use multiple hosts to generate load to the benchmark cluster: + +- `--load-worker-coordinator-hosts`: Defines a comma-separated list of hosts that coordinate loads. Default is `localhost`. +- `--enable-worker-coordinator-profiling`: Enables an analysis of the performance of OpenSearch Benchmark's worker coordinator. Default is `false`. + +### Provisioning + +The following options help customize how OpenSearch Benchmark provisions OpenSearch and workloads: + +- `--provision-config-repository`: Defines the repository from which OpenSearch Benchmark loads `provision-configs` and `provision-config-instances`. +- `--provision-config-path`: Defines the path to the `--provision-config-instance` and any OpenSearch plugin configurations to use. +- `--provision-config-revision`: Defines a specific Git revision in the `provision-config` that OpenSearch Benchmark should use. +- `--provision-config-instance`: Defines the `--provision-config-instance` to use. You can see possible configuration instances using the command `opensearch-benchmark list provision-config-instances`. 
+- `--provision-config-instance-params`: A comma-separated list of key-value pairs injected verbatim as variables for the `provision-config-instance`.
+
+
+### Workload
+
+The following options determine which workload is used to run the test:
+
+- `--workload-repository`: Defines the repository from which OpenSearch Benchmark loads workloads.
+- `--workload-path`: Defines the path to a downloaded or custom workload.
+- `--workload-revision`: Defines a specific revision from the workload source tree that OpenSearch Benchmark should use.
+- `--workload`: Defines the workload to use based on the workload's name. You can find a list of preloaded workloads using `opensearch-benchmark list workloads`.
+
+### Test procedures
+
+The following options define which test procedures the test uses and which operations are contained inside the procedure:
+
+- `--test-execution-id`: Defines a unique ID for this test run.
+- `--test-procedure`: Defines the test procedures to use with each workload. You can find a list of test procedures that the workload supports by specifying the workload in the `info` command, for example, `opensearch-benchmark info --workload=`. To look up information on a specific test procedure, use the command `opensearch-benchmark info --workload= --test-procedure=`.
+- `--include-tasks`: Defines a comma-separated list of test procedure tasks to run. By default, all tasks listed in a test procedure array are run.
+- `--exclude-tasks`: Defines a comma-separated list of test procedure tasks not to run.
+- `--enable-assertions`: Enables assertion checks for tasks. Default is `false`.
+
+### Pipelines
+
+The `--pipeline` option selects a pipeline to run. You can find a list of pipelines supported by OpenSearch Benchmark by running `opensearch-benchmark list pipelines`.
+
+
+### Telemetry
+
+The following options enable telemetry devices on OpenSearch Benchmark:
+
+- `--telemetry`: Enables the provided telemetry devices when the devices are provided using a comma-separated list. You can find a list of possible telemetry devices by using `opensearch-benchmark list telemetry`.
+- `--telemetry-params`: Defines a comma-separated list of key-value pairs that are injected verbatim into the telemetry devices as parameters.
+
+
+### Errors
+
+The following options set how OpenSearch Benchmark handles errors when running tests:
+
+- `--on-error`: Controls how OpenSearch Benchmark responds to errors. Default is `continue`.
+  - `continue`: Continues to run the test despite the error.
+  - `abort`: Aborts the test when an error occurs.
+- `--preserve-install`: Keeps the Benchmark candidate and its index. Default is `false`.
+- `--kill-running-processes`: When set to `true`, stops any OpenSearch Benchmark processes currently running and allows OpenSearch Benchmark to continue to run. Default is `false`.
diff --git a/_benchmark/reference/commands/index.md b/_benchmark/reference/commands/index.md
new file mode 100644
index 00000000..12276d17
--- /dev/null
+++ b/_benchmark/reference/commands/index.md
@@ -0,0 +1,27 @@
+---
+layout: default
+title: Command reference
+nav_order: 50
+has_children: true
+parent: OpenSearch Benchmark Reference
+redirect_from: /benchmark/commands/index/
+---
+
+# OpenSearch Benchmark command reference
+
+This section provides a list of commands supported by OpenSearch Benchmark, including commonly used commands such as `execute-test` and `list`.
+ +- [compare]({{site.url}}{{site.baseurl}}/benchmark/commands/compare/) +- [download]({{site.url}}{{site.baseurl}}/benchmark/commands/download/) +- [execute-test]({{site.url}}{{site.baseurl}}/benchmark/commands/execute-test/) +- [info]({{site.url}}{{site.baseurl}}/benchmark/commands/info/) +- [list]({{site.url}}{{site.baseurl}}/benchmark/commands/list/) + +## List of common options + +All OpenSearch Benchmark commands support the following options: + +- `--h` or `--help`: Provides options and other useful information about each command. +- `--quiet`: Hides as much of the results output as possible. Default is `false`. +- `--offline`: Indicates whether OpenSearch Benchmark has a connection to the internet. Default is `false`. + diff --git a/_benchmark/reference/commands/info.md b/_benchmark/reference/commands/info.md new file mode 100644 index 00000000..c8c20ad1 --- /dev/null +++ b/_benchmark/reference/commands/info.md @@ -0,0 +1,162 @@ +--- +layout: default +title: info +nav_order: 75 +parent: Command reference +grand_parent: OpenSearch Benchmark Reference +redirect_from: /benchmark/commands/info/ +--- + + +# info + + +The `info` command prints details about an OpenSearch Benchmark component. + +## Usage + +The following example returns information about a workload named `nyc_taxis`: + +``` +opensearch-benchmark info --workload=nyc_taxis +``` + +OpenSearch Benchmark returns information about the workload, as shown in the following example response: + +``` + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + +Showing details for workload [nyc_taxis]: + +* Description: Taxi rides in New York in 2015 +* Documents: 165,346,692 +* Compressed Size: 4.5 GB +* Uncompressed Size: 74.3 GB + +=================================== +TestProcedure [searchable-snapshot] +=================================== + +Measuring performance for Searchable Snapshot feature. Based on the default test procedure 'append-no-conflicts'. + +Schedule: +---------- + +1. delete-index +2. create-index +3. check-cluster-health +4. index (8 clients) +5. refresh-after-index +6. force-merge +7. refresh-after-force-merge +8. wait-until-merges-finish +9. create-snapshot-repository +10. delete-snapshot +11. create-snapshot +12. wait-for-snapshot-creation +13. delete-local-index +14. restore-snapshot +15. default +16. range +17. distance_amount_agg +18. autohisto_agg +19. date_histogram_agg + +==================================================== +TestProcedure [append-no-conflicts] (run by default) +==================================================== + +Indexes the entire document corpus using a setup that will lead to a larger indexing throughput than the default settings and produce a smaller index (higher compression rate). Document IDs are unique, so all index operations are append only. After that, a couple of queries are run. + +Schedule: +---------- + +1. delete-index +2. create-index +3. check-cluster-health +4. index (8 clients) +5. refresh-after-index +6. force-merge +7. refresh-after-force-merge +8. wait-until-merges-finish +9. default +10. range +11. distance_amount_agg +12. autohisto_agg +13. 
date_histogram_agg + +============================================== +TestProcedure [append-no-conflicts-index-only] +============================================== + +Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings and produce a smaller index (higher compression rate). Document ids are unique so all index operations are append only. + +Schedule: +---------- + +1. delete-index +2. create-index +3. check-cluster-health +4. index (8 clients) +5. refresh-after-index +6. force-merge +7. refresh-after-force-merge +8. wait-until-merges-finish + +===================================================== +TestProcedure [append-sorted-no-conflicts-index-only] +===================================================== + +Indexes the whole document corpus in an index sorted by pickup_datetime field in descending order (most recent first) and using a setup that will lead to a larger indexing throughput than the default settings and produce a smaller index (higher compression rate). Document ids are unique so all index operations are append only. + +Schedule: +---------- + +1. delete-index +2. create-index +3. check-cluster-health +4. index (8 clients) +5. refresh-after-index +6. force-merge +7. refresh-after-force-merge +8. wait-until-merges-finish + +====================== +TestProcedure [update] +====================== + +Schedule: +---------- + +1. delete-index +2. create-index +3. check-cluster-health +4. update (8 clients) +5. refresh-after-index +6. force-merge +7. refresh-after-force-merge +8. wait-until-merges-finish + + +------------------------------- +[INFO] SUCCESS (took 2 seconds) +------------------------------- +``` + +## Options + +You can use the following options with the `info` command: + + +- `--workload-repository`: Defines the repository from where OpenSearch Benchmark loads workloads. +- `--workload-path`: Defines the path to a downloaded or custom workload. +- `--workload-revision`: Defines a specific revision from the workload source tree that OpenSearch Benchmark should use. +- `--workload`: Defines the workload to use based on the workload's name. You can find a list of preloaded workloads using `opensearch-benchmark list workloads`. +- `--test-procedure`: Defines a test procedure to use. You can find a list of test procedures using `opensearch-benchmark list test_procedures`. +- `--include-tasks`: Defines a comma-separated list of test procedure tasks to run. By default, all tasks listed in a test procedure array are run. +- `--exclude-tasks`: Defines a comma-separated list of test procedure tasks not to run. 
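+
+For example, to display only the `searchable-snapshot` test procedure from the `nyc_taxis` workload shown above, you can combine the `--workload` and `--test-procedure` flags:
+
+```bash
+opensearch-benchmark info --workload=nyc_taxis --test-procedure=searchable-snapshot
+```
+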
diff --git a/_benchmark/reference/commands/list.md b/_benchmark/reference/commands/list.md new file mode 100644 index 00000000..ed10f02a --- /dev/null +++ b/_benchmark/reference/commands/list.md @@ -0,0 +1,74 @@ +--- +layout: default +title: list +nav_order: 80 +parent: Command reference +grand_parent: OpenSearch Benchmark Reference +redirect_from: /benchmark/commands/list/ +--- + + +# list + + +The `list` command lists the following elements used by OpenSearch Benchmark: + +- `telemetry`: Telemetry devices +- `workloads`: Workloads +- `pipelines`: Pipelines +- `test_executions`: Single run of a workload +- `provision_config_instances`: Provisioned configuration instances +- `opensearch-plugins`: OpenSearch plugins + + +## Usage + +The following example lists any workload test runs and detailed information about each test: + +``` +`opensearch-benchmark list test_executions +``` + +OpenSearch Benchmark returns information about each test. + +``` +benchmark list test_executions + + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + + +Recent test_executions: + +TestExecution ID TestExecution Timestamp Workload Workload Parameters TestProcedure ProvisionConfigInstance User Tags workload Revision Provision Config Revision +------------------------------------ ------------------------- ---------- --------------------- ------------------- ------------------------- ----------- ------------------- --------------------------- +729291a0-ee87-44e5-9b75-cc6d50c89702 20230524T181718Z geonames append-no-conflicts 4gheap 30260cf +f91c33d0-ec93-48e1-975e-37476a5c9fe5 20230524T170134Z geonames append-no-conflicts 4gheap 30260cf +d942b7f9-6506-451d-9dcf-ef502ab3e574 20230524T144827Z geonames append-no-conflicts 4gheap 30260cf +a33845cc-c2e5-4488-a2db-b0670741ff9b 20230523T213145Z geonames append-no-conflicts 4gheap 30260cf +ba643ed3-0db5-452e-a680-2b0dc0350cf2 20230522T224450Z geonames append-no-conflicts external 30260cf +8d366ec5-3322-4e09-b041-a4b02e870033 20230519T201514Z geonames append-no-conflicts external 30260cf +4574c13e-8742-41af-a4fa-79480629ecf0 20230519T195617Z geonames append-no-conflicts external 30260cf +3e240d18-fc87-4c49-9712-863196efcef4 20230519T195412Z geonames append-no-conflicts external 30260cf +90f066ae-3d83-41e9-bbeb-17cb0480d578 20230519T194448Z geonames append-no-conflicts external 30260cf +78602e07-0ff8-4f00-9a0e-746fb64e4129 20230519T193258Z geonames append-no-conflicts external 30260cf + +------------------------------- +[INFO] SUCCESS (took 0 seconds) +------------------------------- +``` + +## Options + +You can use the following options with the `test` command: + +- `--limit`: Limits the number of search results for recent test runs. Default is `10`. +- `--workload-repository`: Defines the repository from where OpenSearch Benchmark loads workloads. +- `--workload-path`: Defines the path to a downloaded or custom workload. +- `--workload-revision`: Defines a specific revision from the workload source tree that OpenSearch Benchmark should use. 
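+
+For example, to display only the five most recent test runs, you can add the `--limit` flag:
+
+```bash
+opensearch-benchmark list test_executions --limit=5
+```
+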
+ + diff --git a/_benchmark/reference/index.md b/_benchmark/reference/index.md new file mode 100644 index 00000000..bc7a423b --- /dev/null +++ b/_benchmark/reference/index.md @@ -0,0 +1,10 @@ +--- +layout: default +title: OpenSearch Benchmark Reference +nav_order: 25 +has_children: true +--- + +# OpenSearch Benchmark Reference + +The following section contains reference materials for commands, metrics, and workloads for OpenSearch Benchmark. \ No newline at end of file diff --git a/_benchmark/reference/metrics/index.md b/_benchmark/reference/metrics/index.md new file mode 100644 index 00000000..63e5a799 --- /dev/null +++ b/_benchmark/reference/metrics/index.md @@ -0,0 +1,63 @@ +--- +layout: default +title: Metrics reference +nav_order: 25 +has_children: true +parent: OpenSearch Benchmark Reference +redirect_from: /benchmark/metrics/index/ +--- + +# Metrics + +After a workload completes, OpenSearch Benchmark stores all metric records within its metrics store. These metrics can be kept in memory or in an OpenSearch cluster. + +## Storing metrics + +You can specify whether metrics are stored in memory or in a metrics store while running the benchmark by setting the [`datastore.type`](https://opensearch.org/docs/latest/benchmark/configuring-benchmark/#results_publishing) parameter in your `benchmark.ini` file. + +### In memory + +If you want to store metrics in memory while running the benchmark, provide the following settings in the `results_publishing` section of `benchmark.ini`: + +```ini +[results_publishing] +datastore.type = in-memory +datastore.host = +datastore.port = +datastore.secure = False +datastore.ssl.verification_mode = +datastore.user = +datastore.password = +``` + +### OpenSearch + +If you want to store metrics in an external OpenSearch memory store while running the benchmark, provide the following settings in the `results_publishing` section of `benchmark.ini`: + +```ini +[results_publishing] +datastore.type = opensearch +datastore.host = +datastore.port = 443 +datastore.secure = true +datastore.ssl.verification_mode = none +datastore.user = +datastore.password = +datastore.number_of_replicas = +datastore.number_of_shards = +``` +When neither `datastore.number_of_replicas` nor `datastore.number_of_shards` is provided, OpenSearch uses the default values: `0` for the number of replicas and `1` for the number of shards. If these settings are changed after the data store cluster is created, the new replica and shard settings will only apply when new results indexes are created at the end of the month. + +After you run OpenSearch Benchmark configured to use OpenSearch as a data store, OpenSearch Benchmark creates three indexes: + +- `benchmark-metrics-YYYY-MM`: Holds granular metric and telemetry data. +- `benchmark-results-YYYY-MM`: Holds data based on final results. +- `benchmark-test-executions-YYYY-MM`: Holds data about `execution-ids`. + +You can visualize data inside these indexes in OpenSearch Dashboards. + + +## Next steps + +- For more information about how to design a metrics store, see [Metric records]({{site.url}}{{site.baseurl}}/benchmark/metrics/metric-records/). +- For more information about what metrics are stored, see [Metric keys]({{site.url}}{{site.baseurl}}/benchmark/metrics/metric-keys/). 
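+
+As a quick check that results are reaching an OpenSearch data store, you can query one of the indexes listed above directly. The following sketch assumes the host, port, and credentials configured in `benchmark.ini`; replace the placeholders with your own values:
+
+```bash
+curl -k -u <datastore.user>:<datastore.password> "https://<datastore.host>:<datastore.port>/benchmark-results-*/_search?size=5&pretty"
+```
+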
diff --git a/_benchmark/reference/metrics/metric-keys.md b/_benchmark/reference/metrics/metric-keys.md new file mode 100644 index 00000000..aa45c9d9 --- /dev/null +++ b/_benchmark/reference/metrics/metric-keys.md @@ -0,0 +1,49 @@ +--- +layout: default +title: Metric keys +nav_order: 35 +parent: Metrics reference +grand_parent: OpenSearch Benchmark Reference +redirect_from: /benchmark/metrics/metric-keys/ +--- + +# Metric keys + +Metric keys are the metrics that OpenSearch Benchmark stores, based on the configuration in the [metrics record]({{site.url}}{{site.baseurl}}/benchmark/metrics/metric-records/). OpenSearch Benchmark stores the following metrics: + + +- `latency`: The time period between submitting a request and receiving the complete response. This also includes wait time, such as the time the request spends waiting until it is ready to be serviced by OpenSearch Benchmark. +- `service_time`: The time period between sending a request and receiving the corresponding response. This metric is similar to latency but does not include wait time. +- `processing_time`: The time period between starting to process a request and receiving the complete response. Contrary to service time, this metric also includes the OpenSearch Benchmark client-side processing overhead. Large differences between service time and processing time indicate a high overhead in the client and can thus point to a potential client-side bottleneck, which requires investigation. +- `throughput`: The number of operations that OpenSearch Benchmark can perform within a certain time period, usually per second. See the [workload reference]({{site.url}}{{site.baseurl}}/benchmark/workloads/index/) for definitions of operation types. +- `disk_io_write_bytes`: The number of bytes written to disk during the benchmark. On Linux, this metric corresponds to only the bytes that have been written by OpenSearch Benchmark. On Mac OS, it includes the number of bytes written by all processes. +- `disk_io_read_bytes`: The number of bytes read from disk during the benchmark. On MacOS, this includes the number of bytes written by all processes. +- `node_startup_time`: The amount of time, in seconds, from the start of the process until the node is running. +- `node_total_young_gen_gc_time`: The total runtime of the young-generation garbage collector across the whole cluster, as reported by the Nodes Stats API. +- `node_total_young_gen_gc_count`: The total number of young-generation garbage collections across the whole cluster, as reported by the Nodes Stats API. +- `node_total_old_gen_gc_time`: The total runtime of the old-generation garbage collector across the whole cluster, as reported by the Nodes Stats API. +- `node_total_old_gen_gc_count`: The total number of old-generation garbage collections across the whole cluster, as reported by the Nodes Stats API. +- `node_total_zgc_cycles_gc_time`: The total time spent by the Z Garbage Collector (ZGC) on garbage collecting across the whole cluster, as reported by the Nodes Stats API. +- `node_total_zgc_cycles_gc_count`: The total number of garbage collections ZGC performed across the whole cluster, as reported by the Nodes Stats API. +- `node_total_zgc_pauses_gc_time`: The total time ZGC spent in Stop-The-World pauses across the whole cluster, as reported by the Nodes Stats API. +- `node_total_zgc_pauses_gc_count`: The total number of Stop-The-World pauses during ZGC execution across the whole cluster, as reported by the Nodes Stats API. 
+- `segments_count`: The total number of open segments, as reported by the Index Stats API. +- `segments_memory_in_bytes`: The total number of bytes used for all open segments, as reported by the Index Stats API. +- `segments_doc_values_memory_in_bytes`: The number of bytes used for document values, as reported by the Index Stats API. +- `segments_stored_fields_memory_in_bytes`: The number of bytes used for stored fields, as reported by the Index Stats API. +- `segments_terms_memory_in_bytes`: The number of bytes used for terms, as reported by the Index Stats API. +- `segments_norms_memory_in_bytes`: The number of bytes used for norms, as reported by the Index Stats API. +- `segments_points_memory_in_bytes`: The number of bytes used for points, as reported by the Index Stats API. +- `merges_total_time`: The cumulative runtime of merges for primary shards, as reported by the Index Stats API. Note that this time is not wall clock time. If M merge threads ran for N minutes, Benchmark reports the amount of time as M * N minutes, not N minutes. These metrics records have an additional per-shard property that contains the times across primary shards in an array. +- `merges_total_count`: The cumulative number of merges of primary shards, as reported by Index Stats API under `_all/primaries`. +- `merges_total_throttled_time`: The cumulative time for merges that have been throttled, as reported by the Index Stats API. Note that this time is not wall clock time. These metrics records have an additional per-shard property that contains the times across primary shards in an array. +- `indexing_total_time`: The cumulative time used for indexing of primary shards, as reported by the Index Stats API. Note that this is not wall clock time. These metrics records have an additional per-shard property that contains the times across primary shards in an array. +- `indexing_throttle_time`: The cumulative time during which indexing has been throttled, as reported by the Index Stats API. Note that this is not wall clock time. These metrics records have an additional per-shard property that contains the times across primary shards in an array. +- `refresh_total_time`: The cumulative time used for index refresh of primary shards, as reported by the Index Stats API. Note that this is not wall clock time. These metrics records have an additional per-shard property that contains the times across primary shards in an array. +- `refresh_total_count`: The cumulative number of refreshes of primary shards, as reported by the Index Stats API under `_all/primaries`. +- `flush_total_time`: The cumulative time used for index flush of primary shards, as reported by the Index Stats API. Note that this is not wall clock time. These metrics records have an additional per-shard property that contains the times across primary shards in an array. +- `flush_total_count`: The cumulative number of flushes of primary shards, as reported by the Index Stats API under `_all/primaries`. +- `final_index_size_bytes`: The final index size on the file system after all nodes have been shut down at the end of the benchmark, in bytes. It includes all files in the nodes’ data directories, such as index files and the translog. +- `store_size_in_bytes`: The size of the index, excluding the translog, as reported by the Index Stats API, in bytes . +- `translog_size_in_bytes`: The size of the translog, as reported by the Index Stats API, in bytes. 
+- `ml_processing_time`: An object containing the minimum, mean, median, and maximum bucket processing time per machine learning job, in milliseconds. These metrics are only available if a machine learning job has been created in the respective benchmark. diff --git a/_benchmark/reference/metrics/metric-records.md b/_benchmark/reference/metrics/metric-records.md new file mode 100644 index 00000000..659f8b82 --- /dev/null +++ b/_benchmark/reference/metrics/metric-records.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Metric records +nav_order: 30 +parent: Metrics reference +grand_parent: OpenSearch Benchmark Reference +redirect_from: /benchmark/metrics/metric-records/ +--- + +# Metric records + +OpenSearch Benchmark stores metrics in the `benchmark-metrics-*` indexes. A new index is created each month. The following is an example metric record stored in the `benchmark-metrics-2023-08` index: + +```json +{ + "_index": "benchmark-metrics-2023-08", + "_id": "UiNY4YkBpMtdJ7uj2rUe", + "_version": 1, + "_score": null, + "_source": { + "@timestamp": 1691702842821, + "relative-time-ms": 65.90720731765032, + "test-execution-id": "8c43ee4c-cb34-494b-81b2-181be244f832", + "test-execution-timestamp": "20230810T212711Z", + "environment": "local", + "workload": "geonames", + "test_procedure": "append-no-conflicts", + "provision-config-instance": "external", + "name": "service_time", + "value": 607.8001195564866, + "unit": "ms", + "sample-type": "normal", + "meta": { + "source_revision": "unknown", + "distribution_version": "1.1.0", + "distribution_flavor": "oss", + "index": "geonames", + "took": 13, + "success": true, + "success-count": 125, + "error-count": 0 + }, + "task": "index-append", + "operation": "index-append", + "operation-type": "bulk" + }, + "fields": { + "@timestamp": [ + "2023-08-10T21:27:22.821Z" + ], + "test-execution-timestamp": [ + "2023-08-10T21:27:11.000Z" + ] + }, + "highlight": { + "workload": [ + "@opensearch-dashboards-highlighted-field@geonames@/opensearch-dashboards-highlighted-field@" + ], + "meta.index": [ + "@opensearch-dashboards-highlighted-field@geonames@/opensearch-dashboards-highlighted-field@" + ] + }, + "sort": [ + 1691702831000 + ] +} +``` + +The following fields found in the `_source` section of the metric's record are configurable in the `opensearch-benchmarks-metrics-*` file. + + +## @timestamp + + +The timestamp of when the sample was taken since the epoch, in milliseconds. For request-related metrics, such as `latency` or `service_time`, this is the timestamp of when OpenSearch Benchmark issued the request. + + +## relative-time-ms + + +The relative time since the start of the benchmark, in milliseconds. This is useful for comparing time-series graphs across multiple tests. For example, you can compare the indexing throughput over time across multiple tests. + + +## test-execution-id + + +A UUID that changes on every invocation of the workload. It is intended to group all samples of a benchmarking run. + + +## test-execution-timestamp + + +The timestamp of when the workload was invoked (always in UTC). + + +## environment + + +The `environment` describes the origin of a metric record. This is defined when initially [configuring]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/) OpenSearch Benchmark. You can use separate environments for different benchmarks but store the metric records in the same index. 
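+
+Because every sample from a single invocation shares the same `test-execution-id`, you can retrieve all records for one run from the monthly metrics index. The following sketch assumes an OpenSearch data store and reuses the example ID from the record above; the host and credentials are placeholders:
+
+```bash
+curl -k -u <datastore.user>:<datastore.password> -H 'Content-Type: application/json' "https://<datastore.host>:443/benchmark-metrics-2023-08/_search?pretty" -d '{"query": {"match": {"test-execution-id": "8c43ee4c-cb34-494b-81b2-181be244f832"}}}'
+```
+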
+ + +## workload, test_procedure, provision-config-instance + + +The workload, test procedures, and configuration instances for which the metrics are produced. + + +## name, value, unit + + +The actual metric name and value, with an optional unit. Depending on the nature of a metric, it is either sampled periodically by OpenSearch Benchmark, for example, CPU utilization or query latency, or measured once, for example, the final size of the index. + + +## sample-type + + +Determines whether to configure a benchmark to run in warmup mode by setting it to `warmup` or `normal`. Only `normal` samples are considered for the results that are reported. + + +## meta + + +The meta information for each metric record, including the following: + +- CPU info: The number of physical and logical cores and the model name. +- OS info: The name and version of the operating system. +- Hostname. +- Node name: A unique name given to each node when OpenSearch Benchmark provisions the cluster. +- Source revision: The Git hash of the version of OpenSearch that is benchmarked. +- Distribution version: The distribution version of OpenSearch that is benchmarked. +- Custom tags: You can define custom tags by using the command line flag `--user-tags`. The tags are prefixed by `tag_` in order to avoid accidental clashes with OpenSearch Benchmark internal tags. +- Operation specific: An optional substructure of the operation. For bulk requests, this may be the number of documents; for searches, the number of hits. + +Depending on the metric record, some meta information might be missing. + +## Next steps + +- For more information about how to access OpenSearch Benchmark metrics, see [Metrics]({{site.url}}{{site.baseurl}}/benchmark/metrics/index/). +- For more information about the metrics stored in OpenSearch Benchmark, see [Metric keys]({{site.url}}{{site.baseurl}}/benchmark/metrics/metric-keys/). diff --git a/_benchmark/reference/summary-report.md b/_benchmark/reference/summary-report.md new file mode 100644 index 00000000..d5396cd3 --- /dev/null +++ b/_benchmark/reference/summary-report.md @@ -0,0 +1,260 @@ +--- +layout: default +title: Summary report +nav_order: 40 +parent: Metrics +--- + +# Summary report + +At the end of each run, OpenSearch Benchmark shows a summary report based on the metric keys defined in the workload. This page gives details on each line of the summary report and that line's associated metric key. + +## Cumulative indexing time of primary shards + +**Corresponding metrics key**: `indexing_total_time` + +The cumulative time used for indexing as reported by the Index Stats API. Note that this is not wall-clock time, for example, if M indexing threads ran for N minutes, report M * N minutes, not N minutes. + +## Cumulative indexing time across primary shards + +**Corresponding metrics key:** `indexing_total_time` (property: `per-shard`) + +The minimum, median, and maximum cumulative time used for indexing across primary shards as reported by the Index Stats API. + +## Cumulative indexing throttle time of primary shards + +**Corresponding metrics key:** `indexing_throttle_time` + +The cumulative time that the indexing has been throttled as reported by the Index Stats API. Note that this is not wall-clock time, for example, if M indexing threads ran for N minutes, report M * N minutes, not N minutes. 
+ + +## Cumulative indexing throttle time across primary shards + +**Corresponding metrics key:** `indexing_throttle_time` (property: `per-shard`) + +The minimum, median, and maximum cumulative time used that indexing has been throttled across primary shards as reported by the Index Stats API. + + +## Cumulative merge time of primary shards + +**Corresponding metrics key:** `merges_total_time` + +The cumulative runtime of merges of primary shards, as reported by the index stats API. Note that this is not wall-clock time. + +## Cumulative merge count of primary shards + +**Corresponding metrics key:** `merges_total_count` + +The cumulative number of merges of primary shards, as reported by the Index Stats API under `_all/primaries`. + + +## Cumulative merge time across primary shards + +**Corresponding metrics key:** `merges_total_time` (property: `per-shard`) + +The minimum, median, and maximum cumulative time of merges across primary shards as reported by the Index Stats API. + + +## Cumulative refresh time of primary shards + +**Corresponding metrics key**: `refresh_total_time` + +The cumulative time used for index refresh of primary shards as reported by the Index Stats API. Note that this is not wall-clock time. + +## Cumulative refresh count of primary shards + +**Corresponding metrics key:** `refresh_total_count` + +The cumulative number of refreshes of primary shards as reported by the Index Stats API under `_all/primaries`. + +## Cumulative refresh time across primary shards + +**Corresponding metrics key:** `refresh_total_time` (property: `per-shard`) + +The minimum, median, and maximum cumulative time for index refresh across primary shards as reported by the Index Stats API. + +## Cumulative flush time of primary shards + +**Corresponding metrics key:** `flush_total_time` + +The cumulative time used for index flush of primary shards as reported by the Index Stats API. Note that this is not wall-clock time. + +## Cumulative flush count of primary shards + +**Corresponding metrics key**: `flush_total_count` + +The cumulative number of flushes of primary shards as reported by the Index Stats API under `_all/primaries`. + + +## Cumulative flush time across primary shards + +**Corresponding metrics key:** `flush_total_time` (property: `per-shard`) + +The minimum, median, and maximum time for index flush across primary shards as reported by the Index Stats API. + +## Cumulative merge throttle time of primary shards + +**Corresponding metrics key:** `merges_total_throttled_time` + +The cumulative time within merges that have been throttled as reported by the Index Stats API. Note that this is not wall-clock time. + +## Cumulative merge throttle time across primary shards + +Corresponding metrics key: `merges_total_throttled_time` (property: `per-shard`) + +The minimum, median, and maximum cumulative time that merges have been throttled across primary shards as reported by the Index Stats API. + +## ML processing time + +Corresponding metrics key: `ml_processing_time` + +The minimum, mean, median, and maximum time in milliseconds that a machine learning (ML) job has spent processing a single bucket. + + +## Total young gen GC time + +**Corresponding metrics key**: `node_total_young_gen_gc_time` + +The total runtime of the young generation (gen) garbage collector (GC) across the whole cluster as reported by the Node Stats API. 
+
+## Total young gen GC count
+
+**Corresponding metrics key:** `node_total_young_gen_gc_count`
+
+The total number of young gen GCs across the whole cluster as reported by the Node Stats API.
+
+
+## Total old gen GC time
+
+**Corresponding metrics key:** `node_total_old_gen_gc_time`
+
+The total runtime of the old gen GC across the whole cluster as reported by the Node Stats API.
+
+## Total old gen GC count
+
+**Corresponding metrics key:** `node_total_old_gen_gc_count`
+
+The total number of old gen GCs across the whole cluster as reported by the Node Stats API.
+
+## Total ZGC cycles GC count
+
+**Corresponding metrics key**: `node_total_zgc_cycles_gc_count`
+
+The total number of garbage collections performed by the Z garbage collector (ZGC) across the whole cluster as reported by the Node Stats API.
+
+## Total ZGC pauses GC time
+
+**Corresponding metrics key**: `node_total_zgc_pauses_gc_time`
+
+The total time spent in stop-the-world pauses by the ZGC across the whole cluster as reported by the Node Stats API.
+
+
+## Total ZGC pauses GC count
+
+
+**Corresponding metrics key**: `node_total_zgc_pauses_gc_count`
+
+The total number of stop-the-world pauses performed by the ZGC across the whole cluster as reported by the Node Stats API.
+
+
+## Store size
+
+**Corresponding metrics key**: `store_size_in_bytes`
+
+The index size in bytes (excluding the translog) as reported by the Index Stats API.
+
+## Translog size
+
+**Corresponding metrics key**: `translog_size_in_bytes`
+
+The translog size in bytes as reported by the Index Stats API.
+
+## Heap used for X
+
+**Corresponding metrics keys**: `segments_*_in_bytes`
+
+The number of bytes used for the corresponding item as reported by the Index Stats API. The item may be any of the following:
+
+- Doc values
+- Terms
+- Norms
+- Points
+- Stored fields
+
+
+## Segments count
+
+**Corresponding metrics key**: `segments_count`
+
+The total number of segments as reported by the Index Stats API.
+
+
+## Total ingest pipeline count
+
+**Corresponding metrics key**: `ingest_pipeline_cluster_count`
+
+The total number of documents ingested by all nodes within the cluster over the race duration.
+
+## Total ingest pipeline time
+
+**Corresponding metrics key**: `ingest_pipeline_cluster_time`
+
+The total time in milliseconds spent preprocessing ingest documents by all nodes within the cluster over the race duration.
+
+
+## Total ingest pipeline failed
+
+**Corresponding metrics key**: `ingest_pipeline_cluster_failed`
+
+The total number of failed ingest operations by all nodes within the cluster over the race duration.
+
+
+## Throughput
+
+**Corresponding metrics key**: `throughput`
+
+The number of operations that OpenSearch can perform within a certain time period, usually per second. The report includes the minimum, mean, median, and maximum throughput for each task.
+
+
+## Latency
+
+**Corresponding metrics key**: `latency`
+
+The time period between the submission of a request and the receipt of the complete response. It includes the time the request spends waiting before it is processed by OpenSearch. OpenSearch Benchmark reports several percentile numbers for each task. Which percentiles are shown depends on how many requests OpenSearch Benchmark can capture during the measurement period.
+
+
+## Service time
+
+**Corresponding metrics key**: `service_time`
+
+The time period between sending a request and receiving the corresponding response. It does not include waiting time. While many load testing tools refer to this metric as _latency_, it is not the same. OpenSearch Benchmark reports several percentile numbers for each task. Which percentiles are shown depends on how many requests OpenSearch Benchmark can capture during the measurement period.
+
+
+
+## Processing time
+
+Processing time is only reported if the setting `output.processingtime` is set to `true` in the OpenSearch Benchmark configuration file.
+{: .note}
+
+**Corresponding metrics key**: `processing_time`
+
+
+The time period between the start of request processing and the retrieval of the complete response. Unlike `service_time`, this metric also includes the OpenSearch Benchmark client's processing overhead. The larger the difference between `service_time` and `processing_time`, the higher the overhead in the client. Depending on your processing goals, this can point to a potential client-side bottleneck that requires investigation.
+
+
+## Error rate
+
+**Corresponding metrics key**: `service_time`. Each `service_time` record has a `meta.success` flag.
+
+The ratio of erroneous responses relative to the total number of responses. Any exception thrown by the Python OpenSearch client is considered erroneous, for example, HTTP response codes 4xx and 5xx or network errors (network unreachable). You can investigate the root cause by inspecting the OpenSearch and OpenSearch Benchmark logs and rerunning the benchmark.
+
+
+## Disk usage
+
+**Corresponding metrics keys**: `disk_usage_total`
+**Metric metadata**: `index` and `field`
+
+The total number of bytes that a single field uses on disk. Recorded for each field returned by the Disk Usage API even if the total is `0`.
+
+
diff --git a/_benchmark/reference/telemetry.md b/_benchmark/reference/telemetry.md
new file mode 100644
index 00000000..a5e9bbeb
--- /dev/null
+++ b/_benchmark/reference/telemetry.md
@@ -0,0 +1,202 @@
+---
+layout: default
+title: Telemetry devices
+nav_order: 45
+parent: OpenSearch Benchmark Reference
+---
+
+# Telemetry devices
+
+Telemetry devices give you additional insights into benchmark results. To view a list of the available telemetry devices, use the command `opensearch-benchmark list telemetry`.
+
+All telemetry devices with a `-stats` suffix can be used with clusters not provisioned by OpenSearch Benchmark. These devices are referred to as **runtime-level telemetry devices**. Alternatively, **setup-level telemetry devices** can only be used when OpenSearch Benchmark provisions a cluster.
+
+This page lists the telemetry devices supported by OpenSearch Benchmark.
+
+
+## jfr
+
+
+The `jfr` telemetry device enables the [Java Flight Recorder (JFR)](https://docs.oracle.com/javacomponents/jmc-5-5/jfr-runtime-guide/index.html) on the benchmark candidate. Up to Java Development Kit (JDK) 11, JFR ships only with Oracle JDK. OpenSearch Benchmark assumes that Oracle JDK is used for benchmarking. If you run benchmarks on JDK 11 or later, [JFR](https://jdk.java.net/jmc/) is also available on OpenJDK.
+
+To enable `jfr`, invoke **Workload** with the command `opensearch-benchmark workload --workload=pmc --telemetry jfr`. Then `jfr` writes a flight recording file that can be opened in Java Mission Control. OpenSearch Benchmark prints the location of the flight recording file on the command line.
+
+The `jfr` device supports the following parameters:
+
+
+- `recording-template`: The name of a custom flight recording template. It is your responsibility to correctly install these recording templates on each target machine.
If none is specified, the default recording JFR template is used. +- `jfr-delay`: The length of time to wait before starting to record. Optional. +- `jfr-duration`: The length of time to record. Optional. + + +## jit + + +The `jit` telemetry device enables JIT compiler logs for the benchmark candidate. If the HotSpot disassembler library is available, the logs will contain the disassembled JIT compiler output, which can be used for low-level analysis. + + +## gc + + +The `gc` telemetry device enables garbage collector (GC) logs for the benchmark candidate. You can use tools such as GCViewer to analyze the GC logs. + +If the runtime JDK is Java 9 or higher, you can specify the `gc-log-config` parameter. The GC logging configuration consists of a list of tags and levels, such as the default value `gc*=info,safepoint=info,age*=trace`. Run `java -Xlog:help` to view a list of available levels and tags. + + +## heapdump + + +The `heapdump` telemetry device captures a heap dump after a benchmark has finished and right before the node is shut down. + + +## node-stats + + +The `node-stats` telemetry device regularly calls the cluster [Node Stats API]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-stats/) and records metrics from the following stats and their associated keys: + +- Index stats: `indices` +- Thread pool stats: `thread_pool` +- JVM buffer pool stats: `jvm.buffer_pools` +- JVM gc stats: `jvm.gc` +- OS mem stats: `os.mem` +- OS cgroup stats: `os.cgroup` +- JVM mem stats: `jvm.mem` +- Circuit breaker stats: `breakers` +- Network-related stats: `transport` +- Process CPU stats: `process.cpu` + +The `node-stats` device supports the following parameters: + +- `node-stats-sample-interval`: A positive number greater than zero denoting the sampling interval in seconds. Default is `1`. +- `node-stats-include-indices`: A Boolean indicating whether index stats should be included. Default is `false`. +- `node-stats-include-indices-metrics`: A comma-separated string specifying the index stats metrics to include. This is useful, for example, to restrict the collected index stats metrics. Specifying this parameter implicitly enables collection of index stats, so you don’t also need to specify `node-stats-include-indices: true.` For example, `--telemetry-params="node-stats-include-indices-metrics:'docs'"` will collect the docs metrics from the index stats. If you want to use multiple fields, pass a JSON file to `telemetry-params`. Default is `docs,store,indexing,search,merges,query_cache,fielddata,segments,translog,request_cache`. +- `node-stats-include-thread-pools`: A Boolean indicating whether thread pool stats should be included. Default is `true`. +- `node-stats-include-buffer-pools`: A Boolean indicating whether buffer pool stats should be included. Default is `true`. +- `node-stats-include-breakers`: A Boolean indicating whether circuit breaker stats should be included. Default is `true`. +- `node-stats-include-gc`: A Boolean indicating whether JVM GC stats should be included. Default is `true`. +- `node-stats-include-mem`: A Boolean indicating whether both JVM heap and OS mem stats should be included. Default is `true`. +- `node-stats-include-cgroup`: A Boolean to include operating system cgroup stats. Memory stats are omitted since OpenSearch outputs them as string values. Use the `os_mem_*` fields instead. Default is `true`. +- `node-stats-include-network`: A Boolean indicating whether network-related stats should be included. Default is `true`. 
+- `node-stats-include-process`: A Boolean indicating whether process CPU stats should be included. Default is `true`.
+- `node-stats-include-indexing-pressure`: A Boolean indicating whether indexing pressure stats should be included. Default is `true`.
+
+
+## recovery-stats
+
+
+The `recovery-stats` telemetry device regularly calls the [CAT Recovery API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-recovery/) and records one metrics document per shard.
+
+This telemetry device supports the following parameters:
+
+- `searchable-snapshots-stats-indices`: An index pattern, or a list of index patterns, for which searchable snapshot stats should additionally be collected. If unset, only cluster-level stats are collected. Default is `None`.
+- `searchable-snapshots-stats-sample-interval`: A positive number greater than zero denoting the sampling interval in seconds. Default is `1`.
+
+
+## shard-stats
+
+
+The `shard-stats` telemetry device regularly calls the cluster [Node Stats API]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-stats/) using a `level=shard` cluster parameter and records one metrics document per shard.
+
+This device supports the `shard-stats-sample-interval` parameter, which defines the sampling interval in seconds. Default is `60`.
+
+
+## data-stream-stats
+
+
+The `data-stream-stats` telemetry device regularly calls the [Data Stream Stats API]({{site.url}}{{site.baseurl}}/im-plugin/data-streams/#step-2-create-a-data-stream) and records one metrics document for cluster-level stats (`_all`) and one metrics document per data stream.
+
+The following is an example of the recorded documents given two data streams in the cluster:
+
+```json
+{
+  "data_streams" : [
+    {
+      "name" : "logs-nginx",
+      "timestamp_field" : {
+        "name" : "request_time"
+      },
+      "indices" : [
+        {
+          "index_name" : ".ds-logs-nginx-000001",
+          "index_uuid" : "-VhmuhrQQ6ipYCmBhn6vLw"
+        }
+      ],
+      "generation" : 1,
+      "status" : "GREEN",
+      "template" : "logs-template-nginx"
+    }
+  ]
+},
+{
+ "name": "data-stream-stats",
+ "data_stream": "my-data-stream-1",
+ "backing_indices": 1,
+ "store_size_bytes": 439137,
+ "maximum_timestamp": 1579936446448
+},
+{
+ "name": "data-stream-stats",
+ "data_stream": "my-data-stream-2",
+ "backing_indices": 1,
+ "store_size_bytes": 439199,
+ "maximum_timestamp": 1579936446448
+}
+```
+
+This telemetry device supports the `data-stream-stats-sample-interval` parameter, which defines the sampling interval in seconds. Default is `10`.
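+
+As with the other `-stats` devices, you can enable `data-stream-stats` and override its sampling interval from the command line. The following command is only a sketch: it assumes the `--telemetry` and `--telemetry-params` flags shown elsewhere on this page, and the workload name and interval value are placeholders:
+
+```bash
+# Hypothetical example: enable the data-stream-stats device and sample every 30 seconds.
+opensearch-benchmark execute-test \
+  --workload=geonames \
+  --telemetry=data-stream-stats \
+  --telemetry-params="data-stream-stats-sample-interval:30"
+```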
+ + +## ingest-pipeline-stats + + +The `ingest-pipeline-stats` telemetry device makes a call at the beginning and end of the benchmark to the Node Stats API and records the deltas in the form of the following documents: + +- Three results documents for each cluster: `ingest_pipeline_cluster_count`, `ingest_pipeline_cluster_time`, `ingest_pipeline_cluster_failed` +- One metrics document for each node’s respective stats: `ingest_pipeline_node_count`, `ingest_pipeline_node_time`, `ingest_pipeline_node_failed` +- One metrics document for each pipeline’s respective stats: `ingest_pipeline_pipeline_count`, `ingest_pipeline_pipeline_time`, `ingest_pipeline_pipeline_failed` +- One metrics document for each pipeline processor’s respective stats: `ingest_pipeline_processor_count`, `ingest_pipeline_processor_time`, `ingest_pipeline_processor_failed` + + +The following example shows each document record given a single cluster, single node, and single pipeline: + +```json +{ + "name": "ingest_pipeline_cluster_count", + "value": 1001, + "meta": { + "cluster_name": "docker-cluster" + } +}, +{ + "name": "ingest_pipeline_node_count", + "value": 1001, + "meta": { + "cluster_name": "docker-cluster", + "node_name": "node-001" + } +}, +{ + "name": "ingest_pipeline_pipeline_count", + "value": 1001, + "meta": { + "cluster_name": "docker-cluster", + "node_name": "node-001", + "ingest_pipeline": "test-pipeline-1" + } +}, +{ + "name": "ingest_pipeline_processor_count", + "value": 1001, + "meta": { + "cluster_name": "docker-cluster", + "node_name": "node-001", + "ingest_pipeline": "test-pipeline-1", + "processor_name": "uppercase_1", + "type": "uppercase" + } +} +``` + + + + diff --git a/_benchmark/reference/workloads/corpora.md b/_benchmark/reference/workloads/corpora.md new file mode 100644 index 00000000..daa2ae65 --- /dev/null +++ b/_benchmark/reference/workloads/corpora.md @@ -0,0 +1,60 @@ +--- +layout: default +title: corpora +parent: Workload reference +grand_parent: OpenSearch Benchmark Reference +nav_order: 70 +redirect_from: /benchmark/workloads/corpora/ +--- + + +# corpora + + +The `corpora` element contains all the document corpora used by the workload. You can use document corpora across workloads by copying and pasting any corpora definitions. + +## Example + +The following example defines a single corpus called `movies` with `11658903` documents and `1544799789` uncompressed bytes: + +```json + "corpora": [ + { + "name": "movies", + "documents": [ + { + "source-file": "movies-documents.json", + "document-count": 11658903, # Fetch document count from command line + "uncompressed-bytes": 1544799789 # Fetch uncompressed bytes from command line + } + ] + } + ] +``` + +## Configuration options + +Use the following options with `corpora`. + +Parameter | Required | Type | Description +:--- | :--- | :--- | :--- +`name` | Yes | String | The name of the document corpus. Because OpenSearch Benchmark uses this name in its directories, use only lowercase names without white spaces. +`documents` | Yes | JSON array | An array of document files. +`meta` | No | String | A mapping of key-value pairs with additional metadata for a corpus. + + +Each entry in the `documents` array consists of the following options. + +Parameter | Required | Type | Description +:--- | :--- | :--- | :--- +`source-file` | Yes | String | The file name containing the corresponding documents for the workload. When using OpenSearch Benchmark locally, documents are contained in a JSON file. 
When providing a `base-url`, use a compressed file format: `.zip`, `.bz2`, `.gz`, `.tar`, `.tar.gz`, `.tgz`, or `.tar.bz2`. The compressed file must contain exactly one JSON file with the same name.
+`document-count` | Yes | Integer | The number of documents in the `source-file`. OpenSearch Benchmark uses this number to determine which parts of the document corpus each client indexes: each of the N clients receives one Nth of the document corpus. When using a source that contains documents with a parent-child relationship, specify the number of parent documents.
+`base-url` | No | String | An http(s), Amazon Simple Storage Service (Amazon S3), or Google Cloud Storage URL that points to the root path where OpenSearch Benchmark can obtain the corresponding source file.
+`source-format` | No | String | Defines the format OpenSearch Benchmark uses to interpret the data file specified in `source-file`. Only `bulk` is supported.
+`compressed-bytes` | No | Integer | The size, in bytes, of the compressed source file, indicating how much data OpenSearch Benchmark downloads.
+`uncompressed-bytes` | No | Integer | The size, in bytes, of the source file after decompression, indicating how much disk space the decompressed source file needs.
+`target-index` | No | String | Defines the name of the index that the `bulk` operation should target. OpenSearch Benchmark automatically derives this value when only one index is defined in the `indices` element. The value of `target-index` is ignored when the `includes-action-and-meta-data` setting is `true`.
+`target-type` | No | String | Defines the document type of the target index targeted in bulk operations. OpenSearch Benchmark automatically derives this value when only one index is defined in the `indices` element and the index has only one type. The value of `target-type` is ignored when the `includes-action-and-meta-data` setting is `true`.
+`includes-action-and-meta-data` | No | Boolean | When set to `true`, indicates that the document file already contains an `action` line and a `meta-data` line. When `false`, indicates that the document file contains only documents. Default is `false`.
+`meta` | No | String | A mapping of key-value pairs with additional metadata for a corpus.
+
diff --git a/_benchmark/reference/workloads/index.md b/_benchmark/reference/workloads/index.md
new file mode 100644
index 00000000..1dd609ca
--- /dev/null
+++ b/_benchmark/reference/workloads/index.md
@@ -0,0 +1,111 @@
+---
+layout: default
+title: Workload reference
+nav_order: 60
+parent: OpenSearch Benchmark Reference
+has_children: true
+redirect_from: /benchmark/workloads/index/
+---
+
+# OpenSearch Benchmark workload reference
+
+A workload is a specification of one or more benchmarking scenarios. A workload typically includes the following:
+
+- One or more data streams that are ingested into indices
+- A set of queries and operations that are invoked as part of the benchmark
+
+This section provides a list of options and examples you can use when customizing or using a workload.
+
+For more information about what comprises a workload, see [Anatomy of a workload]({{site.url}}{{site.baseurl}}/benchmark/understanding-workloads/anatomy-of-a-workload/).
+
+
+## Workload examples
+
+If you want to try certain workloads before creating your own, use the following examples.
+
+### Running unthrottled
+
+In the following example, OpenSearch Benchmark runs an unthrottled bulk index operation for 1 hour against the `movies` index:
+
+```json
+{
+  "description": "Tutorial benchmark for OpenSearch Benchmark",
+  "indices": [
+    {
+      "name": "movies",
+      "body": "index.json"
+    }
+  ],
+  "corpora": [
+    {
+      "name": "movies",
+      "documents": [
+        {
+          "source-file": "movies-documents.json",
+          "document-count": 11658903, # Fetch document count from command line
+          "uncompressed-bytes": 1544799789 # Fetch uncompressed bytes from command line
+        }
+      ]
+    }
+  ],
+  "schedule": [
+    {
+      "operation": "bulk",
+      "warmup-time-period": 120,
+      "time-period": 3600,
+      "clients": 8
+    }
+  ]
+}
+```
+
+### Workload with a single task
+
+The following workload runs a benchmark with a single task: a `match_all` query. Because no `clients` are indicated, only one client is used. According to the `schedule`, the workload runs the `match_all` query at 10 operations per second with 1 client, uses 100 iterations to warm up, and uses the next 100 iterations to measure the benchmark:
+
+```json
+{
+  "description": "Tutorial benchmark for OpenSearch Benchmark",
+  "indices": [
+    {
+      "name": "movies",
+      "body": "index.json"
+    }
+  ],
+  "corpora": [
+    {
+      "name": "movies",
+      "documents": [
+        {
+          "source-file": "movies-documents.json",
+          "document-count": 11658903, # Fetch document count from command line
+          "uncompressed-bytes": 1544799789 # Fetch uncompressed bytes from command line
+        }
+      ]
+    }
+  ],
+  "schedule": [
+    {
+      "operation": {
+        "operation-type": "search",
+        "index": "_all",
+        "body": {
+          "query": {
+            "match_all": {}
+          }
+        }
+      },
+      "warmup-iterations": 100,
+      "iterations": 100,
+      "target-throughput": 10
+    }
+  ]
+}
+```
+
+## Next steps
+
+- For more information about configuring OpenSearch Benchmark, see [Configuring OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/).
+- For a list of prepackaged workloads for OpenSearch Benchmark, see the [opensearch-benchmark-workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) repository.
diff --git a/_benchmark/reference/workloads/indices.md b/_benchmark/reference/workloads/indices.md
new file mode 100644
index 00000000..daaf6101
--- /dev/null
+++ b/_benchmark/reference/workloads/indices.md
@@ -0,0 +1,34 @@
+---
+layout: default
+title: indices
+parent: Workload reference
+grand_parent: OpenSearch Benchmark Reference
+nav_order: 65
+redirect_from: /benchmark/workloads/indices/
+---
+
+
+# indices
+
+
+The `indices` element contains a list of all indices used in the workload.
+
+## Example
+
+```json
+"indices": [
+    {
+      "name": "geonames",
+      "body": "geonames-index.json"
+    }
+]
+```
+
+## Configuration options
+
+Use the following options with `indices`:
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`name` | Yes | String | The name of the index.
+`body` | No | String | The file name corresponding to the index definition used in the body of the Create Index API.
diff --git a/_benchmark/reference/workloads/operations.md b/_benchmark/reference/workloads/operations.md
new file mode 100644
index 00000000..ed6e6b85
--- /dev/null
+++ b/_benchmark/reference/workloads/operations.md
@@ -0,0 +1,338 @@
+---
+layout: default
+title: operations
+parent: Workload reference
+grand_parent: OpenSearch Benchmark Reference
+nav_order: 100
+---
+
+
+# operations
+
+
+The `operations` element contains a list of all available operations for specifying a schedule.
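+
+Each entry in `operations` is a JSON object that sets a `name`, an `operation-type`, and any parameters specific to that operation type. The following minimal sketch combines the `bulk` and `search` examples described later on this page; the operation names are placeholders:
+
+```yml
+"operations": [
+  {
+    "name": "index-append",
+    "operation-type": "bulk",
+    "bulk-size": 5000
+  },
+  {
+    "name": "match-all-query",
+    "operation-type": "search",
+    "body": {
+      "query": {
+        "match_all": {}
+      }
+    }
+  }
+]
+```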
+ + +## bulk + + +The `bulk` operation type allows you to run [bulk](/api-reference/document-apis/bulk/) requests as a task. + +### Usage + +The following example shows a `bulk` operation type with a `bulk-size` of `5000` documents: + +```yml +{ + "name": "index-append", + "operation-type": "bulk", + "bulk-size": 5000 +} +``` + +### Split documents among clients + +When you have multiple `clients`, OpenSearch Benchmark splits each document based on the set number of clients. Having multiple `clients` parallelizes the bulk index operations but doesn't preserve the ingestion order of each document. For example, if `clients` is set to `2`, one client indexes the document starting from the beginning, while the other client indexes the document starting from the middle. + +If there are multiple documents or corpora, OpenSearch Benchmark attempts to index all documents in parallel in two ways: + +1. Each client starts at a different point in the corpus. For example, in a workload with 2 corpora and 5 clients, clients 1, 3, and 5 begin with the first corpus, whereas clients 2 and 4 start with the second corpus. +2. Each client is assigned to multiple documents. Client 1 starts with the first split of the first document of the first corpus. Then it moves to the first split of the first document of the second corpus, and so on. + +### Configuration options + +Use the following options to customize the `bulk` operation. + +Parameter | Required | Type | Description +:--- | :--- | :--- | :--- +`bulk-size` | Yes | Number | Specifies the number of documents to be ingested in the bulk request. +`ingest-percentage` | No | Range [0, 100] | Defines the portion of the document corpus to be indexed. Valid values are numbers between 0 and 100. +`corpora` | No | List | Defines which document corpus names should be targeted by the bulk operation. Only needed if the `corpora` section contains more than one document corpus and you don’t want to index all of them during the bulk request. +`indices` | No | List | Defines which indexes should be used in the bulk index operation. OpenSearch Benchmark only selects document files that have a matching `target-index`. +`batch-size` | No | Number | Defines how many documents OpenSearch Benchmark reads simultaneously. This is an expert setting and is only meant to avoid accidental bottlenecks for very small bulk sizes. If you want to benchmark with a `bulk-size` of `1`, you should set a higher `batch-size`. +`pipeline` | No | String | Defines which existing ingest pipeline to use. +`conflicts` | No | String | Defines the type of index `conflicts` to simulate. If not specified, none are simulated. Valid values are ‘sequential’, which replaces a document ID with a sequentially increasing document ID, and ‘random’, which replaces a document ID with a random document ID. +`conflict-probability` | No | Percentage | Defines how many of the documents are replaced when a conflict exists. Combining `conflicts=sequential` and `conflict-probability=0` makes OpenSearch Benchmark generate the index ID itself instead of using OpenSearch's automatic ID generation. Valid values are numbers between 0 and 100. Default is `25%`. +`on-conflict` | No | String | Determines whether OpenSearch should use the action `index` or `update` index for ID conflicts. Default is `index`, which creates a new index during ID conflicts. +`recency` | No | Number | Uses a number between 0 and 1 to indicate recency. A recency closer to `1` biases conflicting IDs toward more recent IDs. 
A recency closer to 0 considers all IDs for ID conflicts. +`detailed-results` | No | Boolean | Records more detailed [metadata](#metadata) for bulk requests. As OpenSearch Benchmark analyzes the corresponding bulk response in more detail, additional overhead may be incurred, which can skew measurement results. This property must be set to `true` so that OpenSearch Benchmark logs individual bulk request failures. +`timeout` | No | Duration | Defines the amount of time (in minutes) that OpenSearch waits per action until completing the processing of the following operations: automatic index creation, dynamic mapping updates, and waiting for active shards. Default is `1m`. +`refresh` | No | String | Controls OpenSearch refresh behavior for bulk requests that use the `refresh` bulk API query parameter. Valid values are `true`, which refreshes target shards in the background; `wait_for`, which blocks bulk requests until affected shards have been refreshed; and `false`, which uses the default refresh behavior. + +### Metadata + +The `bulk` operation always returns the following metadata: + +- `index`: The name of the affected index. If an index cannot be derived, it returns `null`. +- `weight`: An operation-agnostic representation of the bulk size, denoted by `units`. +- `unit`: The unit used to interpret `weight`. +- `success`: A Boolean indicating whether the `bulk` request succeeded. +- `success-count`: The number of successfully processed bulk items for the request. This value is determined when there are errors or when the `bulk-size` has been specified in the documents. +- `error-count`: The number of failed bulk items for the request. +- `took`: The value of the `took` property in the bulk response. + +If `detailed-results` is `true`, the following metadata is returned: + +- `ops`: A nested document with the operation name as its key, such as `index`, `update`, or `delete`, and various counts as values. `item-count` contains the total number of items for this key. Additionally, OpenSearch Benchmark returns a separate counter for each result, for example, a result for the number of created items or the number of deleted items. +- `shards_histogram`: An array of hashes, each of which has two keys. The `item-count` key contains the number of items to which a shard distribution applies. The `shards` key contains a hash with the actual distribution of `total`, `successful`, and `failed` shards. +- `bulk-request-size-bytes`: The total size of the bulk request body, in bytes. +- `total-document-size-bytes`: The total size of all documents within the bulk request body, in bytes. + + +## create-index + + +The `create-index` operation runs the [Create Index API](/api-reference/index-apis/create-index/). It supports the following two modes of index creation: + +- Creating all indexes specified in the workloads `indices` section +- Creating one specific index defined within the operation itself + +### Usage + +The following example creates all indexes defined in the `indices` section of the workload. 
It uses all of the index settings defined in the workload but overrides the number of shards:
+
+```yml
+{
+  "name": "create-all-indices",
+  "operation-type": "create-index",
+  "settings": {
+    "index.number_of_shards": 1
+  },
+  "request-params": {
+    "wait_for_active_shards": "true"
+  }
+}
+```
+
+The following example creates a new index with all index settings specified in the operation body:
+
+```yml
+{
+  "name": "create-an-index",
+  "operation-type": "create-index",
+  "index": "people",
+  "body": {
+    "settings": {
+      "index.number_of_shards": 1
+    },
+    "mappings": {
+      "properties": {
+        "name": {
+          "type": "text"
+        }
+      }
+    }
+  }
+}
+```
+
+### Configuration options
+
+Use the following options when creating all indexes from the `indices` section of a workload.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`settings` | No | Array | Specifies additional index settings to be merged with the index settings specified in the `indices` section of the workload.
+`request-params` | No | List of settings | Contains any request parameters allowed by the Create Index API. OpenSearch Benchmark does not attempt to serialize the parameters and passes them in their current state.
+
+Use the following options when creating a single index in the operation.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`index` | Yes | String | The index name.
+`body` | No | Request body | The request body for the Create Index API. For more information, see [Create Index API](/api-reference/index-apis/create-index/).
+`request-params` | No | List of settings | Contains any request parameters allowed by the Create Index API. OpenSearch Benchmark does not attempt to serialize the parameters and passes them in their current state.
+
+### Metadata
+
+The `create-index` operation returns the following metadata:
+
+- `weight`: The number of indexes created by the operation.
+- `unit`: Always `ops`, indicating the number of operations inside the workload.
+- `success`: A Boolean indicating whether the operation has succeeded.
+
+
+## delete-index
+
+
+The `delete-index` operation runs the [Delete Index API](/api-reference/index-apis/delete-index/). As with the [`create-index`](#create-index) operation, you can delete all indexes found in the `indices` section of the workload or delete one or more indexes based on the string passed in the `index` setting.
+
+### Usage
+
+The following example deletes all indexes found in the `indices` section of the workload:
+
+```yml
+{
+  "name": "delete-all-indices",
+  "operation-type": "delete-index"
+}
+```
+
+The following example deletes all `logs-*` indexes:
+
+```yml
+{
+  "name": "delete-logs",
+  "operation-type": "delete-index",
+  "index": "logs-*",
+  "only-if-exists": false,
+  "request-params": {
+    "expand_wildcards": "all",
+    "allow_no_indices": "true",
+    "ignore_unavailable": "true"
+  }
+}
+```
+
+### Configuration options
+
+Use the following options when deleting all indexes indicated in the `indices` section of the workload.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`only-if-exists` | No | Boolean | Decides whether an existing index should be deleted. Default is `true`.
+`request-params` | No | List of settings | Contains any request parameters allowed by the Delete Index API. OpenSearch Benchmark does not attempt to serialize the parameters and passes them in their current state.
+
+Use the following options if you want to delete one or more indexes based on the pattern indicated in the `index` option.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`index` | Yes | String | The index or indexes that you want to delete.
+`only-if-exists` | No | Boolean | Decides whether an index should be deleted only when the index exists. Default is `true`.
+`request-params` | No | List of settings | Contains any request parameters allowed by the Delete Index API. OpenSearch Benchmark does not attempt to serialize the parameters and passes them in their current state.
+
+### Metadata
+
+The `delete-index` operation returns the following metadata:
+
+- `weight`: The number of indexes deleted by the operation.
+- `unit`: Always `ops`, for the number of operations inside the workload.
+- `success`: A Boolean indicating whether the operation has succeeded.
+
+
+## cluster-health
+
+
+The `cluster-health` operation runs the [Cluster Health API](/api-reference/cluster-api/cluster-health/), which checks the cluster health status and returns the expected status according to the parameters set for `request-params`. If an unexpected cluster health status is returned, the operation reports a failure. You can use the `--on-error` option in the OpenSearch Benchmark `execute-test` command to control how OpenSearch Benchmark behaves when the health check fails.
+
+
+### Usage
+
+The following example creates a `cluster-health` operation that checks for a `green` health status on any `logs-*` indexes:
+
+```yml
+{
+  "name": "check-cluster-green",
+  "operation-type": "cluster-health",
+  "index": "logs-*",
+  "request-params": {
+    "wait_for_status": "green",
+    "wait_for_no_relocating_shards": "true"
+  },
+  "retry-until-success": true
+}
+```
+
+### Configuration options
+
+Use the following options with the `cluster-health` operation.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`index` | Yes | String | The index or indexes that you want to assess.
+`request-params` | No | List of settings | Contains any request parameters allowed by the Cluster Health API. OpenSearch Benchmark does not attempt to serialize the parameters and passes them in their current state.
+
+### Metadata
+
+The `cluster-health` operation returns the following metadata:
+
+- `weight`: The number of indexes that the `cluster-health` operation assesses. Always `1`, because the operation runs once per index.
+- `unit`: Always `ops`, for the number of operations inside the workload.
+- `success`: A Boolean indicating whether the operation has succeeded.
+- `cluster-status`: The current cluster status.
+- `relocating-shards`: The number of shards currently relocating to a different node.
+
+
+## refresh
+
+
+The `refresh` operation runs the Refresh API. This operation returns no metadata.
+
+### Usage
+
+The following example refreshes all `logs-*` indexes:
+
+```yml
+{
+  "name": "refresh",
+  "operation-type": "refresh",
+  "index": "logs-*"
+}
+```
+
+### Configuration options
+
+The `refresh` operation uses the following options.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`index` | No | String | The names of the indexes or data streams to refresh.
+
+
+## search
+
+
+The `search` operation runs the [Search API](/api-reference/search/), which you can use to run queries in OpenSearch Benchmark indexes.
+ +### Usage + +The following example runs a `match_all` query inside the `search` operation: + +```yml +{ + "name": "default", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + }, + "request-params": { + "_source_include": "some_field", + "analyze_wildcard": "false" + } +} +``` + +### Configuration options + +The `search` operation uses the following options. + +Parameter | Required | Type | Description +:--- | :--- | :--- | :--- +`index` | No | String | The indexes or data streams targeted by the query. This option is needed only when the `indices` section contains two or more indexes. Otherwise, OpenSearch Benchmark automatically derives the index or data stream to use. Specify `"index": "_all"` to query against all indexes in the workload. +`cache` | No | Boolean | Specifies whether to use the query request cache. OpenSearch Benchmark defines no value. The default depends on the benchmark candidate settings and the OpenSearch version. +`request-params` | No | List of settings | Contains any request parameters allowed by the Search API. +`body` | Yes | Request body | Indicates which query and query parameters to use. +`detailed-results` | No | Boolean | Records more detailed metadata about queries. When set to `true`, additional overhead may be incurred, which can skew measurement results. This option does not work with `scroll` queries. +`results-per-page` | No | Integer | Specifies the number of documents to retrieve per page. This maps to the Search API `size` parameter and can be used for scroll and non-scroll searches. Default is `10`. + +### Metadata + +The following metadata is always returned: + +- `weight`: The “weight” of an operation. Always `1` for regular queries and the number of retrieved pages for scroll queries. +- `unit`: The unit used to interpret weight, which is `ops` for regular queries and `pages` for scroll queries. +- `success`: A Boolean indicating whether the query has succeeded. + +If `detailed-results` is set to `true`, the following metadata is also returned: + +- `hits`: The total number of hits for the query. +- `hits_relation`: Whether the number of hits is accurate (eq) or a lower bound of the actual hit count (gte). +- `timed_out`: Whether the query has timed out. For scroll queries, this flag is `true` if the flag was `true` for any of the queries issued. + - `took`: The value of the `took` property in the query response. For scroll queries, the value is the sum of all `took` values in all query responses. + + diff --git a/_benchmark/reference/workloads/test-procedures.md b/_benchmark/reference/workloads/test-procedures.md new file mode 100644 index 00000000..43099f0a --- /dev/null +++ b/_benchmark/reference/workloads/test-procedures.md @@ -0,0 +1,194 @@ +--- +layout: default +title: test_procedures +parent: Workload reference +grand_parent: OpenSearch Benchmark Reference +nav_order: 110 +--- + + +# test_procedures + + +If your workload only defines one benchmarking scenario, specify the schedule at the top level. Use the `test-procedures` element to specify additional properties, such as a name or description. A test procedure is like a benchmarking scenario. If you have multiple test procedures, you can define a variety of challenges. + +The following table lists test procedures for the benchmarking scenarios in this dataset. A test procedure can reference all operations that are defined in the operations section. 
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`name` | Yes | String | The name of the test procedure. When naming the test procedure, do not use spaces; this ensures that the name can be easily entered on the command line.
+`description` | No | String | Describes the test procedure in a human-readable format.
+`user-info` | No | String | Outputs a message at the start of the test to notify you about important test-related information, for example, deprecations.
+`default` | No | Boolean | When set to `true`, selects the default test procedure if you did not specify a test procedure on the command line. If the workload only defines one test procedure, it is implicitly selected as the default. Otherwise, you must define `"default": true` on exactly one test procedure.
+[`schedule`](#schedule) | Yes | Array | Defines the order in which workload tasks are run.
+
+
+## schedule
+
+
+The `schedule` element contains a list of tasks, which are operations supported by OpenSearch Benchmark, that are run by the workload during the benchmark test.
+
+### Usage
+
+The `schedule` element defines tasks using the methods described in this section.
+
+#### Using the operations element
+
+The following example defines a `force-merge` task and a `match-all-query` task using the `operations` element. The `force-merge` operation does not use any parameters, so only the `name` and `operation-type` are needed. The `match-all-query` operation requires a query `body` and an `operation-type`.
+
+Operations defined in the `operations` element can be reused in the schedule more than once:
+
+```yml
+{
+  "operations": [
+    {
+      "name": "force-merge",
+      "operation-type": "force-merge"
+    },
+    {
+      "name": "match-all-query",
+      "operation-type": "search",
+      "body": {
+        "query": {
+          "match_all": {}
+        }
+      }
+    }
+  ],
+  "schedule": [
+    {
+      "operation": "force-merge",
+      "clients": 1
+    },
+    {
+      "operation": "match-all-query",
+      "clients": 4,
+      "warmup-iterations": 1000,
+      "iterations": 1000,
+      "target-throughput": 100
+    }
+  ]
+}
+```
+
+#### Defining operations inline
+
+If you don't want to reuse an operation in the schedule, you can define operations inside the `schedule` element, as shown in the following example:
+
+```yml
+{
+  "schedule": [
+    {
+      "operation": {
+        "name": "force-merge",
+        "operation-type": "force-merge"
+      },
+      "clients": 1
+    },
+    {
+      "operation": {
+        "name": "match-all-query",
+        "operation-type": "search",
+        "body": {
+          "query": {
+            "match_all": {}
+          }
+        }
+      },
+      "clients": 4,
+      "warmup-iterations": 1000,
+      "iterations": 1000,
+      "target-throughput": 100
+    }
+  ]
+}
+```
+
+### Task options
+
+Each task contains the following options.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`operation` | Yes | List | Either refers to the name of an operation, defined in the `operations` element, or includes the entire operation inline.
+`name` | No | String | Specifies a unique name for the task when multiple tasks use the same operation.
+`tags` | No | String | Unique identifiers that can be used to filter tasks.
+`clients` | No | Integer | Specifies the number of clients that will run the task concurrently. Default is `1`.
+
+### Target options
+
+You can use one of the following options when running a task.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
`target-throughput` | No | Integer | Defines the benchmark mode. When not defined, OpenSearch Benchmark assumes that it is a throughput benchmark and runs the task as fast as possible. This is useful for batch operations, where achieving better throughput is preferred over better latency. When defined, the target specifies the number of requests per second across all clients. For example, if you specify `target-throughput: 1000` with 8 clients, each client issues 125 (= 1000 / 8) requests per second.
+`target-interval` | No | Interval | Defines an interval of 1 divided by the `target-throughput` (in seconds) when the `target-throughput` is less than 1 operation per second. Define either `target-throughput` or `target-interval` but not both; otherwise, OpenSearch Benchmark raises an error.
+`ignore-response-error-level` | No | Boolean | Controls whether to ignore errors encountered during the task when a benchmark is run with the `on-error=abort` command flag.
+
+### Iteration-based options
+
+Iteration-based options determine the number of times that an operation should run. They can also define the number of iterative runs when tasks are run in [parallel](#parallel-tasks). To configure an iteration-based schedule, use the following options.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`iterations` | No | Integer | Specifies the number of times that a client should execute an operation. All iterations are included in the measured results. Default is `1`.
+`warmup-iterations` | No | Integer | Specifies the number of times that a client should execute an operation in order to warm up the benchmark candidate. The `warmup-iterations` do not appear in the measurement results. Default is `0`.
+
+### Parallel tasks
+
+The `parallel` element concurrently runs the tasks wrapped inside the element.
+
+When running tasks in parallel, each task requires the `clients` option in order to ensure that clients inside your benchmark are reserved for that task. Otherwise, when the `clients` option is specified inside the `parallel` element without a connection to a task, the benchmark uses that number of clients for all tasks.
+
+#### Usage
+
+In the following example, `parallel-task-1` and `parallel-task-2` execute a `bulk` operation concurrently:
+
+```yml
+{
+  "name": "parallel-any",
+  "description": "Workload completed-by property",
+  "schedule": [
+    {
+      "parallel": {
+        "tasks": [
+          {
+            "name": "parallel-task-1",
+            "operation": {
+              "operation-type": "bulk",
+              "bulk-size": 1000
+            },
+            "clients": 8
+          },
+          {
+            "name": "parallel-task-2",
+            "operation": {
+              "operation-type": "bulk",
+              "bulk-size": 500
+            },
+            "clients": 8
+          }
+        ]
+      }
+    }
+  ]
+}
+```
+
+#### Options
+
+The `parallel` element supports all `schedule` parameters, in addition to the following options.
+
+Parameter | Required | Type | Description
+:--- | :--- | :--- | :---
+`tasks` | Yes | Array | Defines a list of tasks that should be executed concurrently.
+`completed-by` | No | String | Allows you to define the name of one task in the task list or the value `any`. If `completed-by` is set to the name of one task in the list, the `parallel` structure is considered to be complete once that specific task has been completed. If `completed-by` is set to `any`, the `parallel` structure is considered to be complete when any one of the tasks in the list has been completed. If `completed-by` is not explicitly defined, the `parallel` structure is considered to be complete as soon as all of the tasks in the list have been completed. See the sketch following this table for an example.
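+
+The following minimal sketch builds on the preceding example and assumes the same hypothetical bulk tasks. Setting `completed-by` to `any` means the `parallel` structure finishes as soon as either task completes:
+
+```yml
+{
+  "schedule": [
+    {
+      "parallel": {
+        "completed-by": "any",
+        "tasks": [
+          {
+            "name": "parallel-task-1",
+            "operation": {
+              "operation-type": "bulk",
+              "bulk-size": 1000
+            },
+            "clients": 8
+          },
+          {
+            "name": "parallel-task-2",
+            "operation": {
+              "operation-type": "bulk",
+              "bulk-size": 500
+            },
+            "clients": 8
+          }
+        ]
+      }
+    }
+  ]
+}
+```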
+ +### Time-based options + +Time-based options determine the duration of time, in seconds, for which operations should run. This is ideal for batch-style operations, which may require an additional warmup period. + +To configure a time-based schedule, use the following options. + +Parameter | Required | Type | Description +:--- | :--- | :--- | :--- +`time-period` | No | Integer | Specifies the time period, in seconds, that OpenSearch Benchmark considers for measurement. This is not required for bulk indexing because OpenSearch Benchmark bulk indexes all documents and naturally measures all samples after the specified `warmup-time-period`. +`ramp-up-time-period` | No | Integer | Specifies the time period, in seconds, during which OpenSearch Benchmark gradually adds clients and reaches the total number of clients specified for the operation. +`warmup-time-period` | No | Integer | Specifies the amount of time, in seconds, to warm up the benchmark candidate. None of the response data captured during the warmup period appears in the measurement results. + diff --git a/_benchmark/tutorials/index.md b/_benchmark/tutorials/index.md new file mode 100644 index 00000000..3e53db2e --- /dev/null +++ b/_benchmark/tutorials/index.md @@ -0,0 +1,10 @@ +--- +layout: default +title: Tutorials +nav_order: 10 +has_children: true +--- + +# Tutorial + +This section of the OpenSearch Benchmark documentation provides a set of tutorials for those who want to learn more advanced OpenSearch Benchmark concepts. \ No newline at end of file diff --git a/_benchmark/tutorials/sigv4.md b/_benchmark/tutorials/sigv4.md new file mode 100644 index 00000000..f7ef38f9 --- /dev/null +++ b/_benchmark/tutorials/sigv4.md @@ -0,0 +1,45 @@ +--- +layout: default +title: AWS Signature Version 4 support +nav_order: 70 +parent: Tutorials +--- + +# Running OpenSearch Benchmark with AWS Signature Version 4 + +OpenSearch Benchmark supports AWS Signature Version 4 authentication. To run Benchmark with Signature Version 4, use the following steps: + +1. Set up an [IAM user or an IAM Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create.html) and provide it access to the OpenSearch cluster using Signature Version 4 authentication. + +2. Set up the following environment variables for your IAM user: + + ```bash + export OSB_AWS_ACCESS_KEY_ID= + export OSB_AWS_SECRET_ACCESS_KEY= + export OSB_REGION= + export OSB_SERVICE=es + ``` + {% include copy.html %} + + If you want to set up an IAM role instead of an IAM user, use the following environment variables instead: + + ```bash + export OSB_AWS_ACCESS_KEY_ID= + export OSB_AWS_SECRET_ACCESS_KEY= + export OSB_AWS_SESSION_TOKEN= + export OSB_REGION= + export OSB_SERVICE=es + ``` + {% include copy.html %} + + If you're testing against Amazon OpenSearch Serverless, set `OSB_SERVICE` to `aoss`. + +3. Customize and run the following `execute-test` command with the ` --client-options=amazon_aws_log_in:environment` flag. This flag tells OpenSearch Benchmark the location of your exported credentials. 
+
+   ```bash
+   opensearch-benchmark execute-test \
+   --target-hosts= \
+   --pipeline=benchmark-only \
+   --workload=geonames \
+   --client-options=timeout:120,amazon_aws_log_in:environment
+   ```
diff --git a/_benchmark/user-guide/concepts.md b/_benchmark/user-guide/concepts.md
new file mode 100644
index 00000000..b353538a
--- /dev/null
+++ b/_benchmark/user-guide/concepts.md
@@ -0,0 +1,114 @@
+---
+layout: default
+title: Concepts
+nav_order: 3
+parent: User guide
+---
+
+# Concepts
+
+Before using OpenSearch Benchmark, familiarize yourself with the following concepts.
+
+## Core concepts and definitions
+
+- **Workload**: The description of one or more benchmarking scenarios that use a specific document corpus to perform a benchmark against your cluster. The document corpus contains any indexes, data files, and operations invoked when the workload runs. You can list the available workloads by using `opensearch-benchmark list workloads` or view any included workloads in the [OpenSearch Benchmark Workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/). For more information about the elements of a workload, see [Anatomy of a workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/anatomy-of-a-workload/). For information about building a custom workload, see [Creating custom workloads]({{site.url}}{{site.baseurl}}/benchmark/creating-custom-workloads/).
+
+- **Pipeline**: A series of steps occurring before and after a workload is run that determines benchmark results. OpenSearch Benchmark supports three pipelines:
+  - `from-sources`: Builds and provisions OpenSearch, runs a benchmark, and then publishes the results.
+  - `from-distribution`: Downloads an OpenSearch distribution, provisions it, runs a benchmark, and then publishes the results.
+  - `benchmark-only`: The default pipeline. Assumes an already running OpenSearch instance, runs a benchmark on that instance, and then publishes the results.
+
+- **Test**: A single invocation of the OpenSearch Benchmark binary.
+
+A workload is a specification of one or more benchmarking scenarios. A workload typically includes the following:
+
+- One or more data streams that are ingested into indexes.
+- A set of queries and operations that are invoked as part of the benchmark.
+
+## Throughput and latency
+
+At the end of each test, OpenSearch Benchmark produces a table that summarizes the following:
+
+- [Service time](#service-time)
+- Throughput
+- [Latency](#latency)
+- The error rate for each completed task or OpenSearch operation.
+
+While the definition of _throughput_ remains consistent with other client-server systems, the definitions of _service time_ and _latency_ differ from most client-server systems in the context of OpenSearch Benchmark. The following table compares the OpenSearch Benchmark definitions of service time and latency with the common definitions for a client-server system.
+
+| Metric | Common definition | **OpenSearch Benchmark definition** |
+| :--- | :--- |:--- |
+| **Throughput** | The number of operations completed in a given period of time. | The number of operations completed in a given period of time. |
+| **Service time** | The amount of time that the server takes to process a request, from the point it receives the request to the point the response is returned. <br><br> It includes the time spent waiting in server-side queues but _excludes_ network latency, load balancer overhead, and deserialization/serialization. | The amount of time that it takes for `opensearch-py` to send a request and receive a response from the OpenSearch cluster. <br><br> It includes the amount of time that it takes for the server to process a request and also _includes_ network latency, load balancer overhead, and deserialization/serialization. |
+| **Latency** | The total amount of time, including the service time and the amount of time that the request waits before being processed. | Based on the `target-throughput` set by the user, the total amount of time that the request waited before receiving the response, in addition to any other delays that occurred before the request was sent. |
+
+For more information about service time and latency in OpenSearch Benchmark, see the [Service time](#service-time) and [Latency](#latency) sections.
+
+
+### Service time
+
+OpenSearch Benchmark does not have insight into how long OpenSearch takes to process a request, apart from extracting the `took` time for the request. In OpenSearch Benchmark, **service time** tracks the amount of time between when OpenSearch Benchmark issues a request and when it receives a response.
+
+OpenSearch Benchmark makes function calls to `opensearch-py` to communicate with an OpenSearch cluster. OpenSearch Benchmark tracks the amount of time between when the `opensearch-py` client sends a request and receives a response from the OpenSearch cluster and considers this to be the service time. Unlike the traditional definition of service time, the OpenSearch Benchmark definition of service time includes overhead, such as network latency, load balancer overhead, or deserialization/serialization. The following image highlights the differences between the traditional definition of service time and the OpenSearch Benchmark definition of service time.
+
+
+
+### Latency
+
+Target throughput is key to understanding the OpenSearch Benchmark definition of **latency**. Target throughput is the rate at which OpenSearch Benchmark issues requests, assuming that responses will be returned instantaneously. `target-throughput` is one of the common workload parameters that can be set for each test and is measured in operations per second.
+
+OpenSearch Benchmark always issues one request at a time for a single client thread, specified as `search-clients` in the workload parameters. If `target-throughput` is set to `0`, OpenSearch Benchmark issues a request immediately after it receives the response from the previous request. If the `target-throughput` is not set to `0`, OpenSearch Benchmark issues the next request to match the `target-throughput`, assuming that responses are returned instantaneously.
+
+#### Example A
+
+The following diagrams illustrate how latency is calculated with an expected request response time of 200ms and the following settings:
+
+- `search-clients` is set to `1`.
+- `target-throughput` is set to `1` operation per second.
+
+
+
+When a request takes longer than 200ms, for example, 1100ms, OpenSearch Benchmark sends the next request, which was supposed to occur at 4.00s based on the `target-throughput`, at 4.10s. All subsequent requests after the 4.10s request attempt to resynchronize with the `target-throughput` setting.
+
+
+
+When measuring the overall latency, OpenSearch Benchmark includes all performed requests. All requests have a latency of 200ms, except for the following two requests:
+
+- The request that lasted 1100ms.
+- The subsequent request that was supposed to start at 4.00s. This request was delayed by 100ms, denoted by the orange area in the following diagram, and had a response time of 200ms.
When calculating the latency for this request, OpenSearch Benchmark will account for the delayed start time and combine it with the response time. Thus, the latency for this request is **300ms**. + + + +#### Example B + +In this example, OpenSearch Benchmark assumes a latency of 200ms and uses the following latency settings: + +- `search_clients` is set to `1`. +- `target-throughput` is set to `10` operations per second. + +The following diagram shows the schedule built by OpenSearch Benchmark with the expected response times. + + + +However, if the assumption is that all responses will take 200ms, 10 operations per second won't be possible. Therefore, the highest throughput OpenSearch Benchmark can reach is 5 operations per second, as shown in the following diagram. + + + +OpenSearch Benchmark does not account for this and continues to try to achieve the `target-throughput` of 10 operations per second. Because of this, delays for each request begin to cascade, as illustrated in the following diagram. + + + +Combining the service time with the delay for each operation provides the following latency measurements for each operation: + +- 200 ms for operation 1 +- 300 ms for operation 2 +- 400 ms for operation 3 +- 500 ms for operation 4 +- 600 ms for operation 5 + +This latency cascade continues, increasing latency by 100ms for each subsequent request. + +### Recommendation + +As shown by the preceding examples, you should be aware of the average service time of each task and provide a `target-throughput` that accounts for the service time. The OpenSearch Benchmark latency is calculated based on the `target-throughput` set by the user, that is, the latency could be redefined as "throughput-based latency." + diff --git a/_benchmark/user-guide/configuring-benchmark.md b/_benchmark/user-guide/configuring-benchmark.md new file mode 100644 index 00000000..4cbf223b --- /dev/null +++ b/_benchmark/user-guide/configuring-benchmark.md @@ -0,0 +1,218 @@ +--- +layout: default +title: Configuring OpenSearch Benchmark +nav_order: 7 +parent: User guide +redirect_from: /benchmark/configuring-benchmark/ +--- + +# Configuring OpenSearch Benchmark + +OpenSearch Benchmark configuration data is stored in `~/.benchmark/benchmark.ini`, which is automatically created the first time OpenSearch Benchmark runs. + +The file is separated into the following sections, which you can customize based on the needs of your cluster. + + +## meta + + +This section contains meta information about the configuration file. + +| Parameter | Type | Description | +| :---- | :---- | :---- | +| `config.version` | Integer | The version of the configuration file format. This property is managed by OpenSearch Benchmark and should not be changed. | + + +## system + + +This section contains global information for the current benchmark environment. This information should be identical on all machines on which OpenSearch Benchmark is installed. + +| Parameter | Type | Description | +| :---- | :---- | :---- | +| `env.name` | String | The name of the benchmark environment used as metadata in metrics documents when an OpenSearch metrics store is configured. Only alphanumeric characters are allowed. Default is `local`. | +| `available.cores` | Integer | Determines the number of available CPU cores. OpenSearch Benchmark aims to create one asyncio event loop per core and distributes it to clients evenly across event loops. Defaults to the number of logical CPU cores for your cluster. 
| +| `async.debug` | Boolean | Enables debug mode on OpenSearch Benchmark's asyncio event loop. Default is `false`. | +| `passenv` | String | A comma-separated list of environment variable names that should be passed to OpenSearch for processing. | + + +## node + + +This section contains node-specific information that can be customized according to the needs of your cluster. + +| Parameter | Type | Description | +| :---- | :---- | :---- | +| `root.dir` | String | The directory that stores all OpenSearch Benchmark data. OpenSearch Benchmark assumes control over this directory and all its subdirectories. | +| `src.root.dir` | String | The directory from which the OpenSearch source code and any OpenSearch plugins are called. Only relevant for benchmarks from [sources](#source). | + + +## source + + +This section contains more details about the OpenSearch source tree. + +| Parameter | Type | Description | +| :---- | :---- | :---- | +| `remote.repo.url` | URL | The URL from which to check out OpenSearch. Default is `https://github.com/opensearch-project/OpenSearch.git`. +| `opensearch.src.subdir` | String | The local path relative to the `src.root.dir` of the OpenSearch search tree. Default is `OpenSearch`. +| `cache` | Boolean | Enables OpenSearch's internal source artifact cache, `opensearch*.tar.gz`, and any plugin zip files. Artifacts are cached based on their Git revision. Default is `true`. | +| `cache.days` | Integer | The number of days that an artifact should be kept in the source artifact cache. Default is `7`. | + + +## benchmarks + + +This section contains the settings that can be customized in the OpenSearch Benchmark data directory. + +| Parameter | Type | Description | +| :---- | :---- | :---- | +| `local.dataset.cache` | String | The directory in which benchmark datasets are stored. Depending on the benchmarks that are run, this directory may contain hundreds of GB of data. Default path is `$HOME/.benchmark/benchmarks/data`. | + + +## results_publishing + + +This section defines how benchmark metrics are stored. + +| Parameter | Type | Description | +| :---- | :---- | :---- | +| `datastore.type` | String | If set to `in-memory` all metrics are kept in memory while running the benchmark. If set to `opensearch` all metrics are instead written to a persistent metrics store and the data is made available for further analysis. Default is `in-memory`. | +| `sample.queue.size` | Function | The number of metrics samples that can be stored in OpenSearch Benchmark’s in-memory queue. Default is `2^20`. | +| metrics.request.downsample.factor | Integer| (default: 1): Determines how many service time and latency samples are saved in the metrics store. By default, all values are saved. If you want to, for example. keep only every 100th sample, specify `100`. This is useful to avoid overwhelming the metrics store in benchmarks with many clients. Default is `1`. | +| `output.processingtime` | Boolean | If set to `true`, OpenSearch shows the additional metric processing time in the command line report. Default is `false`. | + + +### `datastore.type` parameters + + +When `datastore.type` is set to `opensearch`, the following reporting settings can be customized. + +| Parameter | Type | Description | +| :---- | :---- | :---- | +| `datastore.host` | IP address | The hostname of the metrics store, for example, `124.340.200.22`. | +| datastore.port| Port | The port number of the metrics store, for example, `9200`. 
| +| `datastore.secure` | Boolean | If set to `false`, OpenSearch assumes an HTTP connection. If set to true, it assumes an HTTPS connection. | +| `datastore.ssl.verification_mode` | String | When set to the default `full`, the metrics store’s SSL certificate is checked. To disable certificate verification, set this value to `none`. | +| `datastore.ssl.certificate_authorities` | String | Determines the local file system path to the certificate authority’s signing certificate. +| `datastore.user` | Username | Sets the username for the metrics store | +| `datastore.password` | String | Sets the password for the metrics store. Alternatively, this password can be configured using the `OSB_DATASTORE_PASSWORD` environment variable, which avoids storing credentials in a plain text file. The environment variable takes precedence over the config file if both define a password. | +| `datastore.probe.cluster_version` | String | Enables automatic detection of the metrics store’s version. Default is `true`. | +| `datastore.number_of_shards` | Integer | The number of primary shards that the `opensearch-*` indexes should have. Any updates to this setting after initial index creation will only be applied to new `opensearch-*` indexes. Default is the [OpenSearch static index value]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#static-index-level-index-settings). | +| `datastore.number_of_replicas` | Integer | The number of replicas each primary shard in the datastore contains. Any updates to this setting after initial index creation will only be applied to new `opensearch-* `indexes. Default is the [OpenSearch static index value]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#static-index-level-index-settings). | + +### Examples + +You can use the following examples to set reporting values in your cluster. + +This example defines an unprotected metrics store in the local network: + +``` +[results_publishing] +datastore.type = opensearch +datastore.host = 192.168.10.17 +datastore.port = 9200 +datastore.secure = false +datastore.user = +datastore.password = +``` + +This example defines a secure connection to a metrics store in the local network with a self-signed certificate: + +``` +[results_publishing] +datastore.type = opensearch +datastore.host = 192.168.10.22 +datastore.port = 9200 +datastore.secure = true +datastore.ssl.verification_mode = none +datastore.user = user-name +datastore.password = the-password-to-your-cluster +``` + + +## workloads + + +This section defines how workloads are retrieved. All keys are read by OpenSearch using the syntax `<>.url`, which you can select using the OpenSearch Benchmark CLI `--workload-repository=workload-repository-name"` option. By default, OpenSearch chooses the workload repository using the `default.url` `https://github.com/opensearch-project/opensearch-benchmark-workloads`. + + +## defaults + + +This section defines the default values of certain OpenSearch Benchmark CLI parameters. + +| Parameter | Type | Description | +| :---- | :---- | :---- | +| `preserve_benchmark_candidate` | Boolean | Determines whether OpenSearch installations are preserved or wiped by default after a benchmark. To preserve an installation for a single benchmark, use the command line flag `--preserve-install`. Default is `false`. + + +## distributions + + +This section defines how OpenSearch versions are distributed. 
+ +| Parameter | Type | Description | +| :---- | :---- | :---- | +| `release.cache` | Boolean | Determines whether newly released OpenSearch versions should be cached locally. | + +## Proxy configurations + +OpenSearch automatically downloads all the necessary proxy data for you, including: + +- OpenSearch distributions, when you specify `--distribution-version=`. +- OpenSearch source code, when you specify a Git revision number, for example, `--revision=1e04b2w`. +- Any metadata tracked from the [OpenSearch GitHub repository](https://github.com/opensearch-project/OpenSearch). + +As of OpenSearch Benchmark 0.5.0, only `http_proxy` is supported. +{: .warning} + +You can use an `http_proxy` to connect OpenSearch Benchmark to a specific proxy and connect the proxy to a benchmark workload. To add the proxy: + + +1. Add your proxy URL to your shell profile: + + ``` + export http_proxy=http://proxy.proxy.org:4444/ + ``` + +2. Source your shell profile and verify that the proxy URL is set correctly: + + ``` + source ~/.bash_profile ; echo $http_proxy + ``` + +3. Configure Git to connect to your proxy by using the following command. For more information, see the [Git documentation](https://git-scm.com/docs/git-config). + + ``` + git config --global http_proxy $http_proxy + ``` + +4. Use `git clone` to clone the workloads repository by using the following command. If the proxy configured correctly, the clone is successful. + + ``` + git clone http://github.com/opensearch-project/opensearch-benchmark-workloads.git + ``` + +5. Lastly, verify that OpenSearch Benchmark can connect to the proxy server by checking the `/.benchmark/logs/benchmark.log` log. When OpenSearch Benchmark starts, you should see the following at the top of the log: + + ``` + Connecting via proxy URL [http://proxy.proxy.org:4444/] to the Internet (picked up from the environment variable [http_proxy]). + ``` + +## Logging + +Logs from OpenSearch Benchmark can be configured in the `~/.benchmark/logging.json` file. For more information about how to format the log file, see the following Python documentation: + +- For general tips and tricks, use the [Python Logging Cookbook](https://docs.python.org/3/howto/logging-cookbook.html). +- For the file format, see the Python [logging configuration schema](https://docs.python.org/3/library/logging.config.html#logging-config-dictschema). +- For instructions on how to customize where the log output is written, see the [logging handlers documentation](https://docs.python.org/3/library/logging.handlers.html). + +By default, OpenSearch Benchmark logs all output to `~/.benchmark/logs/benchmark.log`. + + + + + + + diff --git a/_benchmark/user-guide/contributing-workloads.md b/_benchmark/user-guide/contributing-workloads.md new file mode 100644 index 00000000..e60f60ea --- /dev/null +++ b/_benchmark/user-guide/contributing-workloads.md @@ -0,0 +1,57 @@ +--- +layout: default +title: Sharing custom workloads +nav_order: 11 +parent: User guide +--- + +# Sharing custom workloads + +You can share a custom workload with other OpenSearch users by uploading it to the [workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/) on GitHub. + +Make sure that any data included in the workload's dataset does not contain proprietary data or personally identifiable information (PII). + +To share a custom workload, follow these steps. + +## Create a README.md + +Provide a detailed `README.MD` file that includes the following: + +- The purpose of the workload. 
When creating a description for the workload, consider its specific use and how that use case differs from others in the [workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/).
+- An example document from the dataset that helps users understand the data's structure.
+- The workload parameters that can be used to customize the workload.
+- A list of default test procedures included in the workload as well as other test procedures that the workload can run.
+- An output sample produced by the workload after a test is run.
+- A copy of the open-source license that gives the user and OpenSearch Benchmark permission to use the dataset.
+
+For an example workload README file, go to the `http_logs` [README](https://github.com/opensearch-project/opensearch-benchmark-workloads/blob/main/http_logs/README.md).
+
+## Verify the workload's structure
+
+The workload must include the following files:
+
+- `workload.json`
+- `index.json`
+- `files.txt`
+- `test_procedures/default.json`
+- `operations/default.json`
+
+Both `default.json` file names can be customized to have a descriptive name. The workload can include an optional `workload.py` file to add more dynamic functionality. For more information about a file's contents, go to [Anatomy of a workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/anatomy-of-a-workload/).
+
+## Testing the workload
+
+All workloads contributed to OpenSearch Benchmark must fulfill the following testing requirements:
+
+- All tests run to explore and produce an example from the workload must target an OpenSearch cluster.
+- The workload must pass all integration tests. Follow these steps to ensure that the workload passes the integration tests:
+  1. Add the workload to your forked copy of the [workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/). Make sure that you've forked both the `opensearch-benchmark-workloads` repository and the [OpenSearch Benchmark](https://github.com/opensearch-project/opensearch-benchmark) repository.
+  2. In your forked OpenSearch Benchmark repository, update the `benchmark-os-it.ini` and `benchmark-in-memory.ini` files in the `/osbenchmark/it/resources` directory to point to the forked workloads repository containing your workload.
+  3. After you've modified the `.ini` files, commit your changes to a branch for testing.
+  4. Run your integration tests using GitHub Actions by selecting the branch to which you committed your changes. Verify that the tests have run as expected.
+  5. If your integration tests run as expected, go to your forked workloads repository and merge your workload changes into branches `1` and `2`. This allows your workload to appear in both major versions of OpenSearch Benchmark.
+
+## Create a PR
+
+After testing the workload, create a pull request (PR) from your fork to the `opensearch-project` [workloads repository](https://github.com/opensearch-project/opensearch-benchmark-workloads/). Add a sample output and summary result to the PR description. The OpenSearch Benchmark maintainers will review the PR.
+
+Once the PR is approved, you must share the data corpora of your dataset. The OpenSearch Benchmark team can then add the dataset to a shared S3 bucket. If your data corpora is stored in an S3 bucket, you can use [AWS DataSync](https://docs.aws.amazon.com/datasync/latest/userguide/create-s3-location.html) to share the data corpora.
Otherwise, you must inform the maintainers of where the data corpora resides. diff --git a/_benchmark/user-guide/creating-osb-workloads.md b/_benchmark/user-guide/creating-osb-workloads.md new file mode 100644 index 00000000..76c57390 --- /dev/null +++ b/_benchmark/user-guide/creating-osb-workloads.md @@ -0,0 +1,390 @@ +--- +layout: default +title: Creating OpenSearch Benchmark workloads +nav_order: 10 +parent: User guide +redirect_from: + - /benchmark/creating-custom-workloads/ + - /benchmark/user-guide/creating-custom-workloads +--- + +# Creating custom workloads + +OpenSearch Benchmark (OSB) includes a set of [workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) that you can use to benchmark data from your cluster. Additionally, if you want to create a workload that is tailored to your own data, you can create a custom workload using one of the following options: + +- [Creating custom workloads](#creating-custom-workloads) + - [Creating a workload from an existing cluster](#creating-a-workload-from-an-existing-cluster) + - [Prerequisites](#prerequisites) + - [Customizing the workload](#customizing-the-workload) + - [Creating a workload without an existing cluster](#creating-a-workload-without-an-existing-cluster) + - [Invoking your custom workload](#invoking-your-custom-workload) + - [Advanced options](#advanced-options) + - [Test mode](#test-mode) + - [Adding variance to test procedures](#adding-variance-to-test-procedures) + - [Separate operations and test procedures](#separate-operations-and-test-procedures) + - [Next steps](#next-steps) + +## Creating a workload from an existing cluster + +If you already have an OpenSearch cluster with indexed data, use the following steps to create a custom workload for your cluster. + +### Prerequisites + +Before creating a custom OSB workload, make sure you have the following prerequisites in place: + +- An OpenSearch cluster with an index that contains 1000 or more documents. If your cluster's index does not contain at least 1000 documents, the workload can still run tests, however, you cannot run workloads using `--test-mode`. +- You must have the correct permissions to access your OpenSearch cluster. For more information about cluster permissions, see [Permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/). + +### Customizing the workload + +To begin creating a custom OSB workload, use the `opensearch-benchmark create-workload` command. + +``` +opensearch-benchmark create-workload \ +--workload="" \ +--target-hosts="" \ +--client-options="basic_auth_user:'',basic_auth_password:''" \ +--indices="" \ +--output-path="" +``` + +Replace the following options in the preceding example with information specific to your existing cluster: + +- `--workload`: A custom name for your custom workload. +- `--target-hosts:` A comma-separated list of host:port pairs from which the cluster extracts data. +- `--client-options`: The basic authentication client options that OpenSearch Benchmark uses to access the cluster. +- `--indices`: One or more indexes inside your OpenSearch cluster that contain data. +- `--output-path`: The directory in which OpenSearch Benchmark creates the workload and its configuration files. + +The following example response creates a workload named `movies` from a cluster with an index named `movies-info`. The `movies-info` index contains over 2,000 documents. 
+ +``` + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + +[INFO] You did not provide an explicit timeout in the client options. Assuming default of 10 seconds. +[INFO] Connected to OpenSearch cluster [380d8fd64dd85b5f77c0ad81b0799e1e] version [1.1.0]. + +Extracting documents for index [movies] for test mode... 1000/1000 docs [100.0% done] +Extracting documents for index [movies]... 2000/2000 docs [100.0% done] + +[INFO] Workload movies has been created. Run it with: opensearch-benchmark --workload-path=/Users/hoangia/Desktop/workloads/movies + +------------------------------- +[INFO] SUCCESS (took 2 seconds) +------------------------------- +``` + +As part of workload creation, OSB generates the following files. You can access them in the directory specified by the `--output-path` option. + +- `workload.json`: Contains general workload specifications. +- `.json`: Contains mappings and settings for the extracted indexes. +- `-documents.json`: Contains the sources of every document from the extracted indexes. Any sources suffixed with `-1k` encompass only a fraction of the document corpus of the workload and are only used when running the workload in test mode. + +By default, OpenSearch Benchmark does not contain a reference to generate queries. Because you have the best understanding of your data, we recommend adding a query to `workload.json` that matches your index's specifications. Use the following `match_all` query as an example of a query added to your workload: + +```json +{ + "operation": { + "name": "query-match-all", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + "clients": 8, + "warmup-iterations": 1000, + "iterations": 1000, + "target-throughput": 100 + } +``` + +### Creating a workload without an existing cluster + +If you want to create a custom workload but do not have an existing OpenSearch cluster with indexed data, you can create the workload by building the workload source files directly. All you need is data that can be exported into a JSON format. + +To build a workload with source files, create a directory for your workload and perform the following steps: + +1. Build a `-documents.json` file that contains rows of documents that comprise the document corpora of the workload and houses all data to be ingested and queried into the cluster. 
The following example shows the first few rows of a `movies-documents.json` file that contains rows of documents about famous movies: + + ```json + # First few rows of movies-documents.json + {"title": "Back to the Future", "director": "Robert Zemeckis", "revenue": "$212,259,762 USD", "rating": "8.5 out of 10", "image_url": "https://imdb.com/images/32"} + {"title": "Avengers: Endgame", "director": "Anthony and Joe Russo", "revenue": "$2,800,000,000 USD", "rating": "8.4 out of 10", "image_url": "https://imdb.com/images/2"} + {"title": "The Grand Budapest Hotel", "director": "Wes Anderson", "revenue": "$173,000,000 USD", "rating": "8.1 out of 10", "image_url": "https://imdb.com/images/65"} + {"title": "The Godfather: Part II", "director": "Francis Ford Coppola", "revenue": "$48,000,000 USD", "rating": "9 out of 10", "image_url": "https://imdb.com/images/7"} + ``` + +2. In the same directory, build a `index.json` file. The workload uses this file as a reference for data mappings and index settings for the documents contained in `-documents.json`. The following example creates mappings and settings specific to the `movie-documents.json` data from the previous step: + + ```json + { + "settings": { + "index.number_of_replicas": 0 + }, + "mappings": { + "dynamic": "strict", + "properties": { + "title": { + "type": "text" + }, + "director": { + "type": "text" + }, + "revenue": { + "type": "text" + }, + "rating": { + "type": "text" + }, + "image_url": { + "type": "text" + } + } + } + } + ``` + +3. Next, build a `workload.json` file that provides a high-level overview of your workload and determines how your workload runs benchmark tests. The `workload.json` file contains the following sections: + + - `indices`: Defines the name of the index to be created in your OpenSearch cluster using the mappings from the workload's `index.json` file created in the previous step. + - `corpora`: Defines the corpora and the source file, including the: + - `document-count`: The number of documents in `-documents.json`. To get an accurate number of documents, run `wc -l -documents.json`. + - `uncompressed-bytes`: The number of bytes inside the index. To get an accurate number of bytes, run `stat -f %z -documents.json` on macOS or `stat -c %s -documents.json` on GNU/Linux. Alternatively, run `ls -lrt | grep -documents.json`. + - `schedule`: Defines the sequence of operations and available test procedures for the workload. + +The following example `workload.json` file provides the entry point for the `movies` workload. The `indices` section creates an index called `movies`. The corpora section refers to the source file created in step one, `movie-documents.json`, and provides the document count and the amount of uncompressed bytes. Lastly, the schedule section defines a few operations the workload performs when invoked, including: + +- Deleting any current index named `movies`. +- Creating an index named `movies` based on data from `movie-documents.json` and the mappings from `index.json`. +- Verifying that the cluster is in good health and can ingest the new index. +- Ingesting the data corpora from `workload.json` into the cluster. +- Querying the results. 
+ + ```json + { + "version": 2, + "description": "Tutorial benchmark for OpenSearch Benchmark", + "indices": [ + { + "name": "movies", + "body": "index.json" + } + ], + "corpora": [ + { + "name": "movies", + "documents": [ + { + "source-file": "movies-documents.json", + "document-count": 11658903, # Fetch document count from command line + "uncompressed-bytes": 1544799789 # Fetch uncompressed bytes from command line + } + ] + } + ], + "schedule": [ + { + "operation": { + "operation-type": "delete-index" + } + }, + { + "operation": { + "operation-type": "create-index" + } + }, + { + "operation": { + "operation-type": "cluster-health", + "request-params": { + "wait_for_status": "green" + }, + "retry-until-success": true + } + }, + { + "operation": { + "operation-type": "bulk", + "bulk-size": 5000 + }, + "warmup-time-period": 120, + "clients": 8 + }, + { + "operation": { + "operation-type": "force-merge" + } + }, + { + "operation": { + "name": "query-match-all", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + "clients": 8, + "warmup-iterations": 1000, + "iterations": 1000, + "target-throughput": 100 + } + ] + } + ``` + +The corpora section refers to the source file created in step one, `movie-documents.json`, and provides the document count and the amount of uncompressed bytes. Lastly, the schedule section defines a few operations the workload performs when invoked, including: + +- Deleting any current index named `movies`. +- Creating an index named `movies` based on data from `movie-documents.json` and the mappings from `index.json`. + - Verifying that the cluster is in good health and can ingest the new index. + - Ingesting the data corpora from `workload.json` into the cluster. + - Querying the results. + + + +For all the workload files created, verify that the workload is functional by running a test. To verify the workload, run the following command, replacing `--workload-path` with a path to your workload directory: + +``` +opensearch-benchmark list workloads --workload-path= +``` + +## Invoking your custom workload + +Use the `opensearch-benchmark execute-test` command to invoke your new workload and run a benchmark test against your OpenSearch cluster, as shown in the following example. Replace `--workload-path` with the path to your custom workload, `--target-host` with the `host:port` pairs for your cluster, and `--client-options` with any authorization options required to access the cluster. + +``` +opensearch-benchmark execute_test \ +--pipeline="benchmark-only" \ +--workload-path="" \ +--target-host="" \ +--client-options="basic_auth_user:'',basic_auth_password:''" +``` + +Results from the test appear in the directory set by `--output-path` option in `workloads.json`. + +## Advanced options + +You can enhance your custom workload's functionality with the following advanced options. + +### Test mode + +If you want run the test in test mode to make sure your workload operates as intended, add the `--test-mode` option to the `execute-test` command. Test mode ingests only the first 1000 documents from each index provided and runs query operations against them. + +To use test mode, create a `-documents-1k.json` file that contains the first 1000 documents from `-documents.json` using the following command: + +``` +head -n 1000 -documents.json > -documents-1k.json +``` + +Then, run `opensearch-benchmark execute-test` with the option `--test-mode`. Test mode runs a quick version of the workload test. 
+ +``` +opensearch-benchmark execute_test \ +--pipeline="benchmark-only" \ +--workload-path="" \ +--target-host="" \ +--client-options"basic_auth_user:'',basic_auth_password:''" \ +--test-mode +``` + +### Adding variance to test procedures + +After using your custom workload several times, you might want to use the same workload but perform the workload's operations in a different order. Instead of creating a new workload or reorganizing the procedures directly, you can provide test procedures to vary workload operations. + +To add variance to your workload operations, go to your `workload.json` file and replace the `schedule` section with a `test_procedures` array, as shown in the following example. Each item in the array contains the following: + +- `name`: The name of the test procedure. +- `default`: When set to `true`, OpenSearch Benchmark defaults to the test procedure specified as `default` in the workload if no other test procedures are specified. +- `schedule`: All the operations the test procedure will run. + + +```json +"test_procedures": [ + { + "name": "index-and-query", + "default": true, + "schedule": [ + { + "operation": { + "operation-type": "delete-index" + } + }, + { + "operation": { + "operation-type": "create-index" + } + }, + { + "operation": { + "operation-type": "cluster-health", + "request-params": { + "wait_for_status": "green" + }, + "retry-until-success": true + } + }, + { + "operation": { + "operation-type": "bulk", + "bulk-size": 5000 + }, + "warmup-time-period": 120, + "clients": 8 + }, + { + "operation": { + "operation-type": "force-merge" + } + }, + { + "operation": { + "name": "query-match-all", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + "clients": 8, + "warmup-iterations": 1000, + "iterations": 1000, + "target-throughput": 100 + } + ] + } + ] +} +``` + +### Separate operations and test procedures + +If you want to make your `workload.json` file more readable, you can separate your operations and test procedures into different directories and reference the path to each in `workload.json`. To separate operations and procedures, perform the following steps: + +1. Add all test procedures to a single file. You can give the file any name. Because the `movies` workload in the preceding contains and index task and queries, this step names the test procedures file `index-and-query.json`. +2. Add all operations to a file named `operations.json`. +3. Reference the new files in `workloads.json` by adding the following syntax, replacing `parts` with the relative path to each file, as shown in the following example: + + ```json + "operations": [ + {% raw %}{{ benchmark.collect(parts="operations/*.json") }}{% endraw %} + ] + # Reference test procedure files in workload.json + "test_procedures": [ + {% raw %}{{ benchmark.collect(parts="test_procedures/*.json") }}{% endraw %} + ] + ``` + +## Next steps + +- For more information about configuring OpenSearch Benchmark, see [Configuring OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/). +- To show a list of prepackaged workloads for OpenSearch Benchmark, see the [opensearch-benchmark-workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) repository. 
diff --git a/_benchmark/user-guide/distributed-load.md b/_benchmark/user-guide/distributed-load.md new file mode 100644 index 00000000..60fc9850 --- /dev/null +++ b/_benchmark/user-guide/distributed-load.md @@ -0,0 +1,71 @@ +--- +layout: default +title: Running distributed loads +nav_order: 15 +parent: User guide +--- + +# Running distributed loads + + +OpenSearch Benchmark loads always run on the same machine on which the benchmark was started. However, you can use multiple load drivers to generate additional benchmark testing loads, particularly for large clusters on multiple machines. This tutorial describes how to distribute benchmark loads across multiple machines in a single cluster. + +## System architecture + +The following tutorial uses a three-node architecture; each node is generated in [Amazon Elastic Compute Cloud (Amazon EC2)](https://docs.aws.amazon.com/ec2/?nc2=h_ql_doc_ec2): + +- **Node 1**: Node 1 acts as the _coordinator node_ and enables distribution and communication between the other two nodes. +- **Node 2** and **Node 3**: The remaining nodes in the cluster are used to generate the load for the benchmark test. + +OpenSearch Benchmark must be installed on all nodes. For installation instructions, see [Installing OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/user-guide/installing-benchmark/). + +Make note of each node's IP address. This tutorial uses the following IP addresses: + +- **Node 1 -- Coordinator node**: 192.0.1.0 +- **Node 2 -- Worker node**: 198.52.100.0 +- **Node 3 -- Worker node**: 198.53.100.0 + +## Step 1: Enable node communication + +Make sure to enable communication for each node. In the AWS Management Console: + +1. Go to the EC2 host for the node. +2. Select **Security**, and then select the security group associated with the node. +3. Use **Add inbound rules** to open traffic to the node, based on the port range and traffic type of your cluster. + +## Step 2: Run daemon processes on each node + +Start OpenSearch Benchmark on each node, using `--node-ip` to initialize OpenSearch Benchmark on the node itself and then `--coordinator-ip` to connect each node to the coordinator node. + +For **Node 1**, the following command identifies the node as the coordinator node: + +``` +opensearch-benchmarkd start --node-ip=192.0.1.0 --coordinator-ip=192.0.1.0 +``` + +The following commands enable **Node 2** and **Node 3** to listen to the coordinator node for load generation instructions. + +**Node 2** + +``` +opensearch-benchmarkd start --node-ip=198.52.100.0 --coordinator-ip=192.0.1.0 +``` + +**Node 3** + +``` +opensearch-benchmarkd start --node-ip=198.53.100.0 --coordinator-ip=192.0.1.0 +``` + +With OpenSearch Benchmark running on all three nodes and the worker nodes set to listen to the coordinator node, you can now run the benchmark test. + +## Step 3: Run the benchmark test + +On **Node 1**, run a benchmark test with the `worker-ips` set to the IP addresses for your worker nodes, as shown in the following example: + +``` +opensearch-benchmark execute_test --pipeline=benchmark-only --workload=eventdata --worker-ips=198.52.100.0,198.53.100.0 --target-hosts= --client-options= --kill-running-processes +``` + +After the test completes, the logs generated by the test appear on your worker nodes. 
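For reference, the following shows one way the preceding command might look when fully populated. The target host (`https://192.0.2.100:9200`) and the client options are placeholder values borrowed from other examples in this guide; substitute the endpoint and credentials for your own cluster:

```
opensearch-benchmark execute_test \
  --pipeline=benchmark-only \
  --workload=eventdata \
  --worker-ips=198.52.100.0,198.53.100.0 \
  --target-hosts=https://192.0.2.100:9200 \
  --client-options=basic_auth_user:admin,basic_auth_password:admin,verify_certs:false \
  --kill-running-processes
```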
+ diff --git a/_benchmark/user-guide/index.md b/_benchmark/user-guide/index.md new file mode 100644 index 00000000..35828552 --- /dev/null +++ b/_benchmark/user-guide/index.md @@ -0,0 +1,10 @@ +--- +layout: default +title: User guide +nav_order: 5 +has_children: true +--- + +# OpenSearch Benchmark User Guide + +The OpenSearch Benchmark User Guide includes core [concepts]({{site.url}}{{site.baseurl}}/benchmark/user-guide/concepts/), [installation]({{site.url}}{{site.baseurl}}/benchmark/installing-benchmark/) instructions, and [configuration options]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/) to help you get the most out of OpenSearch Benchmark. \ No newline at end of file diff --git a/_benchmark/user-guide/installing-benchmark.md b/_benchmark/user-guide/installing-benchmark.md new file mode 100644 index 00000000..d3387dee --- /dev/null +++ b/_benchmark/user-guide/installing-benchmark.md @@ -0,0 +1,210 @@ +--- +layout: default +title: Installing OpenSearch Benchmark +nav_order: 5 +parent: User guide +redirect_from: /benchmark/installing-benchmark/ +--- + +# Installing OpenSearch Benchmark + +You can install OpenSearch Benchmark directly on a host running Linux or macOS, or you can run OpenSearch Benchmark in a Docker container on any compatible host. This page provides general considerations for your OpenSearch Benchmark host as well as instructions for installing OpenSearch Benchmark. + + +## Choosing appropriate hardware + +OpenSearch Benchmark can be used to provision OpenSearch nodes for testing. If you intend to use OpenSearch Benchmark to provision nodes in your environment, then install OpenSearch Benchmark directly on each host in the cluster. Additionally, you must configure each host in the cluster for OpenSearch. See [Installing OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/) for guidance on important host settings. + +Remember that OpenSearch Benchmark cannot be used to provision OpenSearch nodes when you run OpenSearch Benchmark in a Docker container. If you want to use OpenSearch Benchmark to provision nodes, or if you want to distribute the benchmark workload with the OpenSearch Benchmark daemon, then you must install OpenSearch Benchmark directly on each host using Python and pip. +{: .important} + +When you select a host, you should also think about which workloads you want to run. To see a list of default benchmark workloads, visit the [opensearch-benchmark-workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) repository on GitHub. As a general rule, make sure that the OpenSearch Benchmark host has enough free storage space to store the compressed data and the fully decompressed data corpus once OpenSearch Benchmark is installed. + +If you want to benchmark with a default workload, then use the following table to determine the approximate minimum amount of required free space needed by adding the compressed size with the uncompressed size. 
+ +| Workload name | Document count | Compressed size | Uncompressed size | +| :----: | :----: | :----: | :----: | +| eventdata | 20,000,000 | 756.0 MB | 15.3 GB | +| geonames | 11,396,503 | 252.9 MB | 3.3 GB | +| geopoint | 60,844,404 | 482.1 MB | 2.3 GB | +| geopointshape | 60,844,404 | 470.8 MB | 2.6 GB | +| geoshape | 60,523,283 | 13.4 GB | 45.4 GB | +| http_logs | 247,249,096 | 1.2 GB | 31.1 GB | +| nested | 11,203,029 | 663.3 MB | 3.4 GB | +| noaa | 33,659,481 | 949.4 MB | 9.0 GB | +| nyc_taxis | 165,346,692 | 4.5 GB | 74.3 GB | +| percolator | 2,000,000 | 121.1 kB | 104.9 MB | +| pmc | 574,199 | 5.5 GB | 21.7 GB | +| so | 36,062,278 | 8.9 GB | 33.1 GB | + +Your OpenSearch Benchmark host should use solid-state drives (SSDs) for storage because they perform read and write operations significantly faster than traditional spinning-disk hard drives. Spinning-disk hard drives can introduce performance bottlenecks, which can make benchmark results unreliable and inconsistent. +{: .tip} + +## Installing on Linux and macOS + +If you want to run OpenSearch Benchmark in a Docker container, see [Installing with Docker](#installing-with-docker). The OpenSearch Benchmark Docker image includes all of the required software, so there are no additional steps required. +{: .important} + +To install OpenSearch Benchmark directly on a UNIX host, such as Linux or macOS, make sure you have **Python 3.8 or later** installed. + +If you need help installing Python, refer to the official [Python Setup and Usage](https://docs.python.org/3/using/index.html) documentation. + +### Checking software dependencies + +Before you begin installing OpenSearch Benchmark, check the following software dependencies. + +Use [pyenv](https://github.com/pyenv/pyenv) to manage multiple versions of Python on your host. This is especially useful if your "system" version of Python is earlier than version 3.8. +{: .tip} + +- Check that Python 3.8 or later is installed: + + ```bash + python3 --version + ``` + {% include copy.html %} + +- Check that `pip` is installed and functional: + + ```bash + pip --version + ``` + {% include copy.html %} + +- _Optional_: Check that your installed version of `git` is **Git 1.9 or later** using the following command. `git` is not required for OpenSearch Benchmark installation, but it is required in order to fetch benchmark workload resources from a repository when you want to perform tests. See the official Git [Documentation](https://git-scm.com/doc) for help installing Git. + + ```bash + git --version + ``` + {% include copy.html %} + +### Completing the installation + +After the required software is installed, you can install OpenSearch Benchmark using the following command: + +```bash +pip install opensearch-benchmark +``` +{% include copy.html %} + +After the installation completes, you can use the following command to display help information: + +```bash +opensearch-benchmark -h +``` +{% include copy.html %} + + +Now that OpenSearch Benchmark is installed on your host, you can learn about [Configuring OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/). + +## Installing with Docker + +You can find the official Docker images for OpenSearch Benchmark on [Docker Hub](https://hub.docker.com/r/opensearchproject/opensearch-benchmark) or on the [Amazon ECR Public Gallery](https://gallery.ecr.aws/opensearchproject/opensearch-benchmark). 
+ + +### Docker limitations + +Some OpenSearch Benchmark functionality is unavailable when you run OpenSearch Benchmark in a Docker container. Specifically, the following restrictions apply: + +- OpenSearch Benchmark cannot distribute load from multiple hosts, such as load worker coordinator hosts. +- OpenSearch Benchmark cannot provision OpenSearch nodes and can only run tests on previously existing clusters. You can only invoke OpenSearch Benchmark commands using the `benchmark-only` pipeline. + +### Pulling the Docker images + +To pull the image from Docker Hub, run the following command: + +```bash +docker pull opensearchproject/opensearch-benchmark:latest +``` +{% include copy.html %} + +To pull the image from Amazon Elastic Container Registry (Amazon ECR): + +```bash +docker pull public.ecr.aws/opensearchproject/opensearch-benchmark:latest +``` +{% include copy.html %} + +### Running OpenSearch Benchmark with Docker + +To run OpenSearch Benchmark, use `docker run` to launch a container. OpenSearch Benchmark subcommands are passed as arguments when you start the container. OpenSearch Benchmark then processes the command and stops the container after the requested operation completes. + +For example, the following command prints the help text for OpenSearch Benchmark to the command line and then stops the container: + +```bash +docker run opensearchproject/opensearch-benchmark -h +``` +{% include copy.html %} + + +### Establishing volume persistence in a Docker container + +To make sure your benchmark data and logs persist after your Docker container stops, specify a Docker volume to mount to the image when you work with OpenSearch Benchmark. + +Use the `-v` option to specify a local directory to mount and a directory in the container where the volume is attached. + +The following example command creates a volume in a user's home directory, mounts the volume to the OpenSearch Benchmark container at `/opensearch-benchmark/.benchmark`, and then runs a test benchmark using the geonames workload. Some client options are also specified: + +```bash +docker run -v $HOME/benchmarks:/opensearch-benchmark/.benchmark opensearchproject/opensearch-benchmark execute-test --target-hosts https://198.51.100.25:9200 --pipeline benchmark-only --workload geonames --client-options basic_auth_user:admin,basic_auth_password:admin,verify_certs:false --test-mode +``` +{% include copy.html %} + +See [Configuring OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/) to learn more about the files and subdirectories located in `/opensearch-benchmark/.benchmark`. + +## Provisioning an OpenSearch cluster with a test + +OpenSearch Benchmark is compatible with JDK versions 17, 16, 15, 14, 13, 12, 11, and 8. +{: .note} + +If you installed OpenSearch with PyPi, you can also provision a new OpenSearch cluster by specifying a `distribution-version` in the `execute-test` command. + +If you plan on having Benchmark provision a cluster, you'll need to inform Benchmark of the location of the `JAVA_HOME` path for the Benchmark cluster. To set the `JAVA_HOME` path and provision a cluster: + +1. Find the `JAVA_HOME` path you're currently using. Open a terminal and enter `/usr/libexec/java_home`. + +2. Set your corresponding JDK version environment variable by entering the path from the previous step. Enter `export JAVA17_HOME=`. + +3. 
Run the `execute-test` command and indicate the distribution version of OpenSearch you want to use: + + ```bash + opensearch-benchmark execute-test --distribution-version=2.3.0 --workload=geonames --test-mode + ``` + +## Directory structure + +After running OpenSearch Benchmark for the first time, you can search through all related files, including configuration files, in the `~/.benchmark` directory. The directory includes the following file tree: + +``` +# ~/.benchmark Tree +. +├── benchmark.ini +├── benchmarks +│ ├── data +│ │ └── geonames +│ ├── distributions +│ │ ├── opensearch-1.0.0-linux-x64.tar.gz +│ │ └── opensearch-2.3.0-linux-x64.tar.gz +│ ├── test_executions +│ │ ├── 0279b13b-1e54-49c7-b1a7-cde0b303a797 +│ │ └── 0279c542-a856-4e88-9cc8-04306378cd38 +│ └── workloads +│ └── default +│ └── geonames +├── logging.json +├── logs +│ └── benchmark.log +``` + +* `benchmark.ini`: Contains any adjustable configurations for tests. For information about how to configure OpenSearch Benchmark, see [Configuring OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/). +* `data`: Contains all the data corpora and documents related to OpenSearch Benchmark's [official workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/geonames). +* `distributions`: Contains all the OpenSearch distributions downloaded from [OpenSearch.org](http://opensearch.org/) and used to provision clusters. +* `test_executions`: Contains all the test `execution_id`s from previous runs of OpenSearch Benchmark. +* `workloads`: Contains all files related to workloads, except for the data corpora. +* `logging.json`: Contains all of the configuration options related to how logging is performed within OpenSearch Benchmark. +* `logs`: Contains all the logs from OpenSearch Benchmark runs. This can be helpful when you've encountered errors during runs. + + +## Next steps + +- [Configuring OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/) +- [Creating custom workloads]({{site.url}}{{site.baseurl}}/benchmark/creating-custom-workloads/) diff --git a/_benchmark/user-guide/running-workloads.md b/_benchmark/user-guide/running-workloads.md new file mode 100644 index 00000000..36108eb9 --- /dev/null +++ b/_benchmark/user-guide/running-workloads.md @@ -0,0 +1,168 @@ +--- +layout: default +title: Running a workload +nav_order: 9 +parent: User guide +--- + +# Running a workload + +Once you have a complete understanding of the various components of an OpenSearch Benchmark [workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/anatomy-of-a-workload/), you can run your first workload. + +## Step 1: Find the workload name + +To learn more about the standard workloads included with OpenSearch Benchmark, use the following command: + +``` +opensearch-benchmark list workloads +``` +{% include copy.html %} + +A list of all workloads supported by OpenSearch Benchmark appears. Review the list and select the workload that's most similar to your cluster's use case. + +## Step 2: Running the test + +After you've selected the workload, you can invoke the workload using the `opensearch-benchmark execute-test` command. Replace `--target-host` with the `host:port` pairs for your cluster and `--client-options` with any authorization options required to access the cluster. The following example runs the `nyc_taxis` workload on a localhost for testing purposes. 
+ +If you want to run a test on an external cluster, see [Running the workload on your own cluster](#running-a-workload-on-an-external-cluster). + +```bash +opensearch-benchmark execute-test --pipeline=benchmark-only --workload=nyc_taxis --target-host=https://localhost:9200 --client-options=basic_auth_user:admin,basic_auth_password:admin,verify_certs:false +``` +{% include copy.html %} + + +Results from the test appear in the directory set by the `--output-path` option in the `execute-test` command. + +### Test mode + +If you want to run the test in test mode to make sure that your workload operates as intended, add the `--test-mode` option to the `execute-test` command. Test mode ingests only the first 1,000 documents from each index provided and runs query operations against them. + +## Step 3: Validate the test + +After running an OpenSearch Benchmark test, take the following steps to verify that it has run properly: + +1. Note the number of documents in the OpenSearch or OpenSearch Dashboards index that you plan to run the benchmark against. +2. In the results returned by OpenSearch Benchmark, compare the `workload.json` file for your specific workload and verify that the document count matches the number of documents. For example, based on the [nyc_taxis](https://github.com/opensearch-project/opensearch-benchmark-workloads/blob/main/nyc_taxis/workload.json#L20) `workload.json` file, you should expect to see `165346692` documents in your cluster. + +## Expected results + +OSB returns the following response once the benchmark completes: + +```bash +------------------------------------------------------ + _______ __ _____ + / ____(_)___ ____ _/ / / ___/_________ ________ + / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \ + / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/ +/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/ +------------------------------------------------------ + +| Metric | Task | Value | Unit | +|---------------------------------------------------------------:|-------------------------------------------:|------------:|-------:| +| Cumulative indexing time of primary shards | | 0.02655 | min | +| Min cumulative indexing time across primary shards | | 0 | min | +| Median cumulative indexing time across primary shards | | 0.00176667 | min | +| Max cumulative indexing time across primary shards | | 0.0140333 | min | +| Cumulative indexing throttle time of primary shards | | 0 | min | +| Min cumulative indexing throttle time across primary shards | | 0 | min | +| Median cumulative indexing throttle time across primary shards | | 0 | min | +| Max cumulative indexing throttle time across primary shards | | 0 | min | +| Cumulative merge time of primary shards | | 0.0102333 | min | +| Cumulative merge count of primary shards | | 3 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0 | min | +| Max cumulative merge time across primary shards | | 0.0102333 | min | +| Cumulative merge throttle time of primary shards | | 0 | min | +| Min cumulative merge throttle time across primary shards | | 0 | min | +| Median cumulative merge throttle time across primary shards | | 0 | min | +| Max cumulative merge throttle time across primary shards | | 0 | min | +| Cumulative refresh time of primary shards | | 0.0709333 | min | +| Cumulative refresh count of primary shards | | 118 | | +| Min cumulative refresh time across primary shards | | 0 | min | +| Median cumulative refresh time across primary shards | | 0.00186667 
| min | +| Max cumulative refresh time across primary shards | | 0.0511667 | min | +| Cumulative flush time of primary shards | | 0.00963333 | min | +| Cumulative flush count of primary shards | | 4 | | +| Min cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 0 | min | +| Max cumulative flush time across primary shards | | 0.00398333 | min | +| Total Young Gen GC time | | 0 | s | +| Total Young Gen GC count | | 0 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 0.000485923 | GB | +| Translog size | | 2.01873e-05 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 32 | | +| Min Throughput | index | 3008.97 | docs/s | +| Mean Throughput | index | 3008.97 | docs/s | +| Median Throughput | index | 3008.97 | docs/s | +| Max Throughput | index | 3008.97 | docs/s | +| 50th percentile latency | index | 351.059 | ms | +| 100th percentile latency | index | 365.058 | ms | +| 50th percentile service time | index | 351.059 | ms | +| 100th percentile service time | index | 365.058 | ms | +| error rate | index | 0 | % | +| Min Throughput | wait-until-merges-finish | 28.41 | ops/s | +| Mean Throughput | wait-until-merges-finish | 28.41 | ops/s | +| Median Throughput | wait-until-merges-finish | 28.41 | ops/s | +| Max Throughput | wait-until-merges-finish | 28.41 | ops/s | +| 100th percentile latency | wait-until-merges-finish | 34.7088 | ms | +| 100th percentile service time | wait-until-merges-finish | 34.7088 | ms | +| error rate | wait-until-merges-finish | 0 | % | +| Min Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| Mean Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| Median Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| Max Throughput | percolator_with_content_president_bush | 36.09 | ops/s | +| 100th percentile latency | percolator_with_content_president_bush | 35.9822 | ms | +| 100th percentile service time | percolator_with_content_president_bush | 7.93048 | ms | +| error rate | percolator_with_content_president_bush | 0 | % | + +[...] 
+ +| Min Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| Mean Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| Median Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| Max Throughput | percolator_with_content_ignore_me | 16.1 | ops/s | +| 100th percentile latency | percolator_with_content_ignore_me | 131.798 | ms | +| 100th percentile service time | percolator_with_content_ignore_me | 69.5237 | ms | +| error rate | percolator_with_content_ignore_me | 0 | % | +| Min Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| Mean Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| Median Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| Max Throughput | percolator_no_score_with_content_ignore_me | 29.37 | ops/s | +| 100th percentile latency | percolator_no_score_with_content_ignore_me | 45.5703 | ms | +| 100th percentile service time | percolator_no_score_with_content_ignore_me | 11.316 | ms | +| error rate | percolator_no_score_with_content_ignore_me | 0 | % | + + + +-------------------------------- +[INFO] SUCCESS (took 18 seconds) +-------------------------------- +``` + + + +## Running a workload on an external cluster + +Now that you're familiar with running OpenSearch Benchmark on a local cluster, you can run it on your external cluster, as described in the following steps: + +1. Replace `https://localhost:9200` with your target cluster endpoint. This could be a Uniform Resource Identifier (URI), such as `https://search.mydomain.com`, or a `HOST:PORT` specification. +2. If the cluster is configured with basic authentication, replace the username and password in the command line with the appropriate credentials. +3. Remove the `verify_certs:false` directive if you are not specifying `localhost` as your target cluster. This directive is necessary solely for clusters without SSL certificates. +4. If you are using a `HOST:PORT`specification and plan to use SSL or TLS, either specify `https://` or add the `use_ssl:true` directive to the `--client-options` string option. +5. Remove the `--test-mode` flag to run the full workload rather than an abbreviated test. + +You can copy the following command template to use it in your own terminal: + +```bash +opensearch-benchmark execute-test --pipeline=benchmark-only --workload=nyc_taxis --target-host= --client-options=basic_auth_user:admin,basic_auth_password:admin +``` +{% include copy.html %} diff --git a/_benchmark/user-guide/telemetry.md b/_benchmark/user-guide/telemetry.md new file mode 100644 index 00000000..d4c40c79 --- /dev/null +++ b/_benchmark/user-guide/telemetry.md @@ -0,0 +1,8 @@ +--- +layout: default +title: Enabling telemetry devices +nav_order: 30 +parent: User guide +--- + +Telemetry results will not appear in the summary report. To visualize telemetry results, ingest the data into OpenSearch and visualize the data in OpenSearch Dashboards. 
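Telemetry devices are enabled with the `--telemetry` option of the `execute-test` command. The following is a minimal sketch of that flow; the workload, target host, and credentials are placeholder values, and the set of available devices depends on your OpenSearch Benchmark version:

```bash
# List the telemetry devices available in your installation.
opensearch-benchmark list telemetry

# Attach the node-stats telemetry device to a test run.
opensearch-benchmark execute-test \
  --pipeline=benchmark-only \
  --workload=geonames \
  --target-hosts=https://localhost:9200 \
  --client-options=basic_auth_user:admin,basic_auth_password:admin,verify_certs:false \
  --telemetry="node-stats" \
  --test-mode
```

Telemetry metrics are written to the configured metrics store, so making them available to OpenSearch Dashboards typically requires setting `datastore.type` to `opensearch` in `benchmark.ini`.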
\ No newline at end of file diff --git a/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md b/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md new file mode 100644 index 00000000..b5493247 --- /dev/null +++ b/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md @@ -0,0 +1,790 @@ +--- +layout: default +title: Anatomy of a workload +nav_order: 15 +grand_parent: User guide +parent: Understanding workloads +--- + +# Anatomy of a workload + +All workloads contain the following files and directories: + +- [workload.json](#workloadjson): Contains all of the workload settings. +- [index.json](#indexjson): Contains the document mappings and parameters as well as index settings. +- [files.txt](#filestxt): Contains the data corpora file names. +- [_test-procedures](#_operations-and-_test-procedures): Most workloads contain only one default test procedure, which is configured in `default.json`. +- [_operations](#_operations-and-_test-procedures): Contains all of the operations used in test procedures. +- workload.py: Adds more dynamic functionality to the test. + +## workload.json + +The following example workload shows all of the essential elements needed to create a `workload.json` file. You can run this workload in your own benchmark configuration to understand how all of the elements work together: + +```json +{ + "description": "Tutorial benchmark for OpenSearch Benchmark", + "indices": [ + { + "name": "movies", + "body": "index.json" + } + ], + "corpora": [ + { + "name": "movies", + "documents": [ + { + "source-file": "movies-documents.json", + "document-count": 11658903, # Fetch document count from command line + "uncompressed-bytes": 1544799789 # Fetch uncompressed bytes from command line + } + ] + } + ], + "schedule": [ + { + "operation": { + "operation-type": "create-index" + } + }, + { + "operation": { + "operation-type": "cluster-health", + "request-params": { + "wait_for_status": "green" + }, + "retry-until-success": true + } + }, + { + "operation": { + "operation-type": "bulk", + "bulk-size": 5000 + }, + "warmup-time-period": 120, + "clients": 8 + }, + { + "operation": { + "name": "query-match-all", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + "iterations": 1000, + "target-throughput": 100 + } + ] +} +``` + +A workload usually includes the following elements: + +- [indices]({{site.url}}{{site.baseurl}}/benchmark/workloads/indices/): Defines the relevant indexes and index templates used for the workload. +- [corpora]({{site.url}}{{site.baseurl}}/benchmark/workloads/corpora/): Defines all document corpora used for the workload. +- `schedule`: Defines operations and the order in which the operations run inline. Alternatively, you can use `operations` to group operations and the `test_procedures` parameter to specify the order of operations. +- `operations`: **Optional**. Describes which operations are available for the workload and how they are parameterized. + +### Indices + +To create an index, specify its `name`. To add definitions to your index, use the `body` option and point it to the JSON file containing the index definitions. For more information, see [Indices]({{site.url}}{{site.baseurl}}/benchmark/workloads/indices/). + +### Corpora + +The `corpora` element requires the name of the index containing the document corpus, for example, `movies`, and a list of parameters that define the document corpora. 
This list includes the following parameters: + +- `source-file`: The file name that contains the workload's corresponding documents. When using OpenSearch Benchmark locally, documents are contained in a JSON file. When providing a `base_url`, use a compressed file format: `.zip`, `.bz2`, `.zst`, `.gz`, `.tar`, `.tar.gz`, `.tgz`, or `.tar.bz2`. The compressed file must include one JSON file containing the name. +- `document-count`: The number of documents in the `source-file`, which determines which client indexes correlate to which parts of the document corpus. Each N client is assigned an Nth of the document corpus to ingest into the test cluster. When using a source that contains a document with a parent-child relationship, specify the number of parent documents. +- `uncompressed-bytes`: The size, in bytes, of the source file after decompression, indicating how much disk space the decompressed source file needs. +- `compressed-bytes`: The size, in bytes, of the source file before decompression. This can help you assess the amount of time needed for the cluster to ingest documents. + +### Operations + +The `operations` element lists the OpenSearch API operations performed by the workload. For example, you can list an operation named `create-index` that creates an index in the benchmark cluster to which OpenSearch Benchmark can write documents. Operations are usually listed inside of the `schedule` element. + +### Schedule + +The `schedule` element contains a list of operations that are run in a specified order, as shown in the following JSON example: + +```json + "schedule": [ + { + "operation": { + "operation-type": "create-index" + } + }, + { + "operation": { + "operation-type": "cluster-health", + "request-params": { + "wait_for_status": "green" + }, + "retry-until-success": true + } + }, + { + "operation": { + "operation-type": "bulk", + "bulk-size": 5000 + }, + "warmup-time-period": 120, + "clients": 8 + }, + { + "operation": { + "name": "query-match-all", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + "iterations": 1000, + "target-throughput": 100 + } + ] +} +``` + +According to this `schedule`, the actions will run in the following order: + +1. The `create-index` operation creates an index. The index remains empty until the `bulk` operation adds documents with benchmarked data. +2. The `cluster-health` operation assesses the cluster's health before running the workload. In the JSON example, the workload waits until the cluster's health status is `green`. + - The `bulk` operation runs the `bulk` API to index `5000` documents simultaneously. + - Before benchmarking, the workload waits until the specified `warmup-time-period` passes. In the JSON example, the warmup period is `120` seconds. +3. The `clients` field defines the number of clients, in this example, eight, that will run the bulk indexing operation concurrently. +4. The `search` operation runs a `match_all` query to match all documents after they have been indexed by the `bulk` API using the specified clients. + - The `iterations` field defines the number of times each client runs the `search` operation. The benchmark report automatically adjusts the percentile numbers based on this number. To generate a precise percentile, the benchmark needs to run at least 1,000 iterations. + - The `target-throughput` field defines the number of requests per second that each client performs. When set, the setting can help reduce benchmark latency. 
For example, a `target-throughput` of 100 requests divided by 8 clients means that each client will issue 12 requests per second. For more information about how target throughput is defined in OpenSearch Benchmark, see [Throughput and latency](https://opensearch.org/docs/latest/benchmark/user-guide/concepts/#throughput-and-latency). + +## index.json + +The `index.json` file defines the data mappings, indexing parameters, and index settings for workload documents during `create-index` operations. + +When OpenSearch Benchmark creates an index for the workload, it uses the index settings and mappings template in the `index.json` file. Mappings in the `index.json` file are based on the mappings of a single document from the workload's corpus, which is stored in the `files.txt` file. The following is an example of the `index.json` file for the `nyc_taxis` workload. You can customize the fields, such as `number_of_shards`, `number_of_replicas`, `query_cache_enabled`, and `requests_cache_enabled`. + +```json +{ + "settings": { + "index.number_of_shards": {% raw %}{{number_of_shards | default(1)}}{% endraw %}, + "index.number_of_replicas": {% raw %}{{number_of_replicas | default(0)}}{% endraw %}, + "index.queries.cache.enabled": {% raw %}{{query_cache_enabled | default(false) | tojson}}{% endraw %}, + "index.requests.cache.enable": {% raw %}{{requests_cache_enabled | default(false) | tojson}}{% endraw %} + }, + "mappings": { + "_source": { + "enabled": {% raw %}{{ source_enabled | default(true) | tojson }}{% endraw %} + }, + "properties": { + "surcharge": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "dropoff_datetime": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss" + }, + "trip_type": { + "type": "keyword" + }, + "mta_tax": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "rate_code_id": { + "type": "keyword" + }, + "passenger_count": { + "type": "integer" + }, + "pickup_datetime": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss" + }, + "tolls_amount": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "tip_amount": { + "type": "half_float" + }, + "payment_type": { + "type": "keyword" + }, + "extra": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "vendor_id": { + "type": "keyword" + }, + "store_and_fwd_flag": { + "type": "keyword" + }, + "improvement_surcharge": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "fare_amount": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "ehail_fee": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "cab_color": { + "type": "keyword" + }, + "dropoff_location": { + "type": "geo_point" + }, + "vendor_name": { + "type": "text" + }, + "total_amount": { + "scaling_factor": 100, + "type": "scaled_float" + }, + "trip_distance": {% raw %}{%- if trip_distance_mapping is defined %} {{ trip_distance_mapping | tojson }} {%- else %}{% endraw %} { + "scaling_factor": 100, + "type": "scaled_float" + }{% raw %}{%- endif %}{% endraw %}, + "pickup_location": { + "type": "geo_point" + } + }, + "dynamic": "strict" + } +} +``` + +## files.txt + +The `files.txt` file lists the files that store the workload data, which are typically stored in a zipped JSON file. + +## _operations and _test-procedures + +To make the workload more human-readable, `_operations` and `_test-procedures` are separated into two directories. + +The `_operations` directory contains a `default.json` file that lists all of the supported operations that the test procedure can use. 
Some workloads, such as `nyc_taxis`, contain an additional `.json` file that lists feature-specific operations, such as `snapshot` operations. The following JSON example shows a list of operations from the `nyc_taxis` workload: + +```json + { + "name": "index", + "operation-type": "bulk", + "bulk-size": {% raw %}{{bulk_size | default(10000)}}{% endraw %}, + "ingest-percentage": {% raw %}{{ingest_percentage | default(100)}}{% endraw %} + }, + { + "name": "update", + "operation-type": "bulk", + "bulk-size": {% raw %}{{bulk_size | default(10000)}}, + "ingest-percentage": {{ingest_percentage | default(100)}}, + "conflicts": "{{conflicts | default('random')}}", + "on-conflict": "{{on_conflict | default('update')}}", + "conflict-probability": {{conflict_probability | default(25)}}, + "recency": {{recency | default(0)}}{% endraw %} + }, + { + "name": "wait-until-merges-finish", + "operation-type": "index-stats", + "index": "_all", + "condition": { + "path": "_all.total.merges.current", + "expected-value": 0 + }, + "retry-until-success": true, + "include-in-reporting": false + }, + { + "name": "default", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + { + "name": "range", + "operation-type": "search", + "body": { + "query": { + "range": { + "total_amount": { + "gte": 5, + "lt": 15 + } + } + } + } + }, + { + "name": "distance_amount_agg", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "bool": { + "filter": { + "range": { + "trip_distance": { + "lt": 50, + "gte": 0 + } + } + } + } + }, + "aggs": { + "distance_histo": { + "histogram": { + "field": "trip_distance", + "interval": 1 + }, + "aggs": { + "total_amount_stats": { + "stats": { + "field": "total_amount" + } + } + } + } + } + } + }, + { + "name": "autohisto_agg", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "01/01/2015", + "lte": "21/01/2015", + "format": "dd/MM/yyyy" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "auto_date_histogram": { + "field": "dropoff_datetime", + "buckets": 20 + } + } + } + } + }, + { + "name": "date_histogram_agg", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "01/01/2015", + "lte": "21/01/2015", + "format": "dd/MM/yyyy" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "calendar_interval": "day" + } + } + } + } + }, + { + "name": "date_histogram_calendar_interval", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "calendar_interval": "month" + } + } + } + } + }, + { + "name": "date_histogram_calendar_interval_with_tz", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "calendar_interval": "month", + "time_zone": "America/New_York" + } + } + } + } + }, + { + "name": "date_histogram_fixed_interval", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + 
"date_histogram": { + "field": "dropoff_datetime", + "fixed_interval": "60d" + } + } + } + } + }, + { + "name": "date_histogram_fixed_interval_with_tz", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "fixed_interval": "60d", + "time_zone": "America/New_York" + } + } + } + } + }, + { + "name": "date_histogram_fixed_interval_with_metrics", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "date_histogram": { + "field": "dropoff_datetime", + "fixed_interval": "60d" + }, + "aggs": { + "total_amount": { "stats": { "field": "total_amount" } }, + "tip_amount": { "stats": { "field": "tip_amount" } }, + "trip_distance": { "stats": { "field": "trip_distance" } } + } + } + } + } + }, + { + "name": "auto_date_histogram", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "auto_date_histogram": { + "field": "dropoff_datetime", + "buckets": "12" + } + } + } + } + }, + { + "name": "auto_date_histogram_with_tz", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "auto_date_histogram": { + "field": "dropoff_datetime", + "buckets": "13", + "time_zone": "America/New_York" + } + } + } + } + }, + { + "name": "auto_date_histogram_with_metrics", + "operation-type": "search", + "body": { + "size": 0, + "query": { + "range": { + "dropoff_datetime": { + "gte": "2015-01-01 00:00:00", + "lt": "2016-01-01 00:00:00" + } + } + }, + "aggs": { + "dropoffs_over_time": { + "auto_date_histogram": { + "field": "dropoff_datetime", + "buckets": "12" + }, + "aggs": { + "total_amount": { "stats": { "field": "total_amount" } }, + "tip_amount": { "stats": { "field": "tip_amount" } }, + "trip_distance": { "stats": { "field": "trip_distance" } } + } + } + } + } + }, + { + "name": "desc_sort_tip_amount", + "operation-type": "search", + "index": "nyc_taxis", + "body": { + "query": { + "match_all": {} + }, + "sort" : [ + {"tip_amount" : "desc"} + ] + } + }, + { + "name": "asc_sort_tip_amount", + "operation-type": "search", + "index": "nyc_taxis", + "body": { + "query": { + "match_all": {} + }, + "sort" : [ + {"tip_amount" : "asc"} + ] + } + } +``` + +The `_test-procedures` directory contains a `default.json` file that sets the order of operations performed by the workload. Similar to the `_operations` directory, the `_test-procedures` directory can also contain feature-specific test procedures, such as `searchable_snapshots.json` for `nyc_taxis`. The following examples show the searchable snapshots test procedures for `nyc_taxis`: + +```json + { + "name": "searchable-snapshot", + "description": "Measuring performance for Searchable Snapshot feature. 
Based on the default test procedure 'append-no-conflicts'.", + "schedule": [ + { + "operation": "delete-index" + }, + { + "operation": { + "operation-type": "create-index", + "settings": {% raw %}{%- if index_settings is defined %} {{ index_settings | tojson }} {%- else %}{ + "index.codec": "best_compression", + "index.refresh_interval": "30s", + "index.translog.flush_threshold_size": "4g" + }{%- endif %}{% endraw %} + } + }, + { + "name": "check-cluster-health", + "operation": { + "operation-type": "cluster-health", + "index": "nyc_taxis", + "request-params": { + "wait_for_status": {% raw %}"{{ cluster_health | default('green') }}"{% endraw %}, + "wait_for_no_relocating_shards": "true" + }, + "retry-until-success": true + } + }, + { + "operation": "index", + "warmup-time-period": 240, + "clients": {% raw %}{{ bulk_indexing_clients | default(8) }}, + "ignore-response-error-level": "{{ error_level | default('non-fatal') }}"{% endraw %} + }, + { + "name": "refresh-after-index", + "operation": "refresh" + }, + { + "operation": { + "operation-type": "force-merge", + "request-timeout": 7200 + {% raw %}{%- if force_merge_max_num_segments is defined %}{% endraw %}, + "max-num-segments": {% raw %}{{ force_merge_max_num_segments | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + } + }, + { + "name": "refresh-after-force-merge", + "operation": "refresh" + }, + { + "operation": "wait-until-merges-finish" + }, + { + "operation": "create-snapshot-repository" + }, + { + "operation": "delete-snapshot" + }, + { + "operation": "create-snapshot" + }, + { + "operation": "wait-for-snapshot-creation" + }, + { + "operation": { + "name": "delete-local-index", + "operation-type": "delete-index" + } + }, + { + "operation": "restore-snapshot" + }, + { + "operation": "default", + "warmup-iterations": 50, + "iterations": 100 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 3 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + }, + { + "operation": "range", + "warmup-iterations": 50, + "iterations": 100 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 0.7 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + }, + { + "operation": "distance_amount_agg", + "warmup-iterations": 50, + "iterations": 50 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 2 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + }, + { 
+ "operation": "autohisto_agg", + "warmup-iterations": 50, + "iterations": 100 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 1.5 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + }, + { + "operation": "date_histogram_agg", + "warmup-iterations": 50, + "iterations": 100 + {% raw %}{%- if not target_throughput %}{% endraw %} + ,"target-throughput": 1.5 + {% raw %}{%- elif target_throughput is string and target_throughput.lower() == 'none' %}{% endraw %} + {% raw %}{%- else %}{% endraw %} + ,"target-throughput": {% raw %}{{ target_throughput | tojson }}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + {% raw %}{%-if search_clients is defined and search_clients %}{% endraw %} + ,"clients": {% raw %}{{ search_clients | tojson}}{% endraw %} + {% raw %}{%- endif %}{% endraw %} + } + ] + } +``` + +## Next steps + +Now that you have familiarized yourself with the anatomy of a workload, see the criteria for [Choosing a workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/choosing-a-workload/). diff --git a/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md b/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md new file mode 100644 index 00000000..d7ae48ad --- /dev/null +++ b/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md @@ -0,0 +1,29 @@ +--- +layout: default +title: Choosing a workload +nav_order: 20 +grand_parent: User guide +parent: Understanding workloads +--- + +# Choosing a workload + +The [opensearch-benchmark-workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) repository contains a list of workloads that you can use to run your benchmarks. Using a workload similar to your cluster's use cases can save you time and effort when assessing your cluster's performance. + +For example, say you're a system architect at a rideshare company. As a rideshare company, you collect and store data based on trip times, locations, and other data related to each rideshare. Instead of building a custom workload and using your own data, which requires additional time, effort, and cost, you can use the [nyc_taxis](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/nyc_taxis) workload to benchmark your cluster because the data inside the workload is similar to the data that you collect. + +## Criteria for choosing a workload + +Consider the following criteria when deciding which workload would work best for benchmarking your cluster: + +- The cluster's use case. +- The data types that your cluster uses compared to the data structure of the documents contained in the workload. Each workload contains an example document so that you can compare data types, or you can view the index mappings and data types in the `index.json` file. +- The query types most commonly used inside your cluster. The `operations/default.json` file contains information about the query types and workload operations. 
+ +## General search clusters + +For benchmarking clusters built for general search use cases, start with the `[nyc_taxis]`(https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/nyc_taxis) workload. This workload contains data about the rides taken in yellow taxis in New York City in 2015. + +## Log data + +For benchmarking clusters built for indexing and search with log data, use the [`http_logs`](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/http_logs) workload. This workload contains data about the 1998 World Cup. \ No newline at end of file diff --git a/_benchmark/user-guide/understanding-workloads/index.md b/_benchmark/user-guide/understanding-workloads/index.md new file mode 100644 index 00000000..844b5651 --- /dev/null +++ b/_benchmark/user-guide/understanding-workloads/index.md @@ -0,0 +1,14 @@ +--- +layout: default +title: Understanding workloads +nav_order: 7 +parent: User guide +has_children: true +--- + +# Understanding workloads + +OpenSearch Benchmark includes a set of [workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) that you can use to benchmark data from your cluster. Workloads contain descriptions of one or more benchmarking scenarios that use a specific document corpus to perform a benchmark against your cluster. The document corpus contains any indexes, data files, and operations invoked when the workflow runs. + + + diff --git a/_clients/OSC-dot-net.md b/_clients/OSC-dot-net.md new file mode 100644 index 00000000..9af9d428 --- /dev/null +++ b/_clients/OSC-dot-net.md @@ -0,0 +1,357 @@ +--- +layout: default +title: Getting started with the high-level .NET client +nav_order: 10 +has_children: false +parent: .NET clients +--- + +# Getting started with the high-level .NET client (OpenSearch.Client) + +OpenSearch.Client is a high-level .NET client. It provides strongly typed requests and responses as well as Query DSL. It frees you from constructing raw JSON requests and parsing raw JSON responses by providing models that parse and serialize/deserialize requests and responses automatically. OpenSearch.Client also exposes the OpenSearch.Net low-level client if you need it. For the client's complete API documentation, see the [OpenSearch.Client API documentation](https://opensearch-project.github.io/opensearch-net/api/OpenSearch.Client.html). + + +This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [opensearch-net repo](https://github.com/opensearch-project/opensearch-net). + +## Installing OpenSearch.Client + +To install OpenSearch.Client, download the [OpenSearch.Client NuGet package](https://www.nuget.org/packages/OpenSearch.Client/) and add it to your project in an IDE of your choice. In Microsoft Visual Studio, follow the steps below: +- In the **Solution Explorer** panel, right-click on your solution or project and select **Manage NuGet Packages for Solution**. +- Search for the OpenSearch.Client NuGet package, and select **Install**. + +Alternatively, you can add OpenSearch.Client to your .csproj file: +```xml + + ... + + + + +``` +{% include copy.html %} + +## Example + +The following example illustrates connecting to OpenSearch, indexing documents, and sending queries on the data. It uses the Student class to represent one student, which is equivalent to one document in the index. 
+ +```cs +public class Student +{ + public int Id { get; init; } + public string FirstName { get; init; } + public string LastName { get; init; } + public int GradYear { get; init; } + public double Gpa { get; init; } +} +``` +{% include copy.html %} + +By default, OpenSearch.Client uses camel case to convert property names to field names. +{: .note} + +## Connecting to OpenSearch + +Use the default constructor when creating an OpenSearchClient object to connect to the default OpenSearch host (`http://localhost:9200`). + +```cs +var client = new OpenSearchClient(); +``` +{% include copy.html %} + +To connect to your OpenSearch cluster through a single node with a known address, specify this address when creating an instance of OpenSearch.Client: + +```cs +var nodeAddress = new Uri("http://myserver:9200"); +var client = new OpenSearchClient(nodeAddress); +``` +{% include copy.html %} + +You can also connect to OpenSearch through multiple nodes. Connecting to your OpenSearch cluster with a node pool provides advantages like load balancing and cluster failover support. To connect to your OpenSearch cluster using multiple nodes, specify their addresses and create a `ConnectionSettings` object for the OpenSearch.Client instance: + +```cs +var nodes = new Uri[] +{ + new Uri("http://myserver1:9200"), + new Uri("http://myserver2:9200"), + new Uri("http://myserver3:9200") +}; + +var pool = new StaticConnectionPool(nodes); +var settings = new ConnectionSettings(pool); +var client = new OpenSearchClient(settings); +``` +{% include copy.html %} + +## Using ConnectionSettings + +`ConnectionConfiguration` is used to pass configuration options to the low-level OpenSearch.Net client. `ConnectionSettings` inherits from `ConnectionConfiguration` and provides additional configuration options. +To set the address of the node and the default index name for requests that don't specify the index name, create a `ConnectionSettings` object: + +```cs +var node = new Uri("http://myserver:9200"); +var config = new ConnectionSettings(node).DefaultIndex("students"); +var client = new OpenSearchClient(config); +``` +{% include copy.html %} + +## Indexing one document + +Create one instance of Student: + +```cs +var student = new Student { Id = 100, FirstName = "Paulo", LastName = "Santos", Gpa = 3.93, GradYear = 2021 }; +``` +{% include copy.html %} + +To index one document, you can use either fluent lambda syntax or object initializer syntax. 
+ +Index this Student into the `students` index using fluent lambda syntax: + +```cs +var response = client.Index(student, i => i.Index("students")); +``` +{% include copy.html %} + +Index this Student into the `students` index using object initializer syntax: + +```cs +var response = client.Index(new IndexRequest(student, "students")); +``` +{% include copy.html %} + +## Indexing many documents + +You can index many documents from a collection at the same time by using the OpenSearch.Client's `IndexMany` method: + +```cs +var studentArray = new Student[] +{ + new() {Id = 200, FirstName = "Shirley", LastName = "Rodriguez", Gpa = 3.91, GradYear = 2019}, + new() {Id = 300, FirstName = "Nikki", LastName = "Wolf", Gpa = 3.87, GradYear = 2020} +}; + +var manyResponse = client.IndexMany(studentArray, "students"); +``` +{% include copy.html %} + +## Searching for a document + +To search for a student indexed above, you want to construct a query that is analogous to the following Query DSL query: + +```json +GET students/_search +{ + "query" : { + "match": { + "lastName": "Santos" + } + } +} +``` + +The query above is a shorthand version of the following explicit query: + +```json +GET students/_search +{ + "query" : { + "match": { + "lastName": { + "query": "Santos" + } + } + } +} +``` + +In OpenSearch.Client, this query looks like this: + +```cs +var searchResponse = client.Search(s => s + .Index("students") + .Query(q => q + .Match(m => m + .Field(fld => fld.LastName) + .Query("Santos")))); +``` +{% include copy.html %} + +You can print out the results by accessing the documents in the response: + +```cs +if (searchResponse.IsValid) +{ + foreach (var s in searchResponse.Documents) + { + Console.WriteLine($"{s.Id} {s.LastName} {s.FirstName} {s.Gpa} {s.GradYear}"); + } +} +``` +{% include copy.html %} + +The response contains one document, which corresponds to the correct student: + +`100 Santos Paulo 3.93 2021` + +## Using OpenSearch.Client methods asynchronously + +For applications that require asynchronous code, all method calls in OpenSearch.Client have asynchronous counterparts: + +```cs +// synchronous method +var response = client.Index(student, i => i.Index("students")); + +// asynchronous method +var response = await client.IndexAsync(student, i => i.Index("students")); +``` + +## Falling back on the low-level OpenSearch.Net client + +OpenSearch.Client exposes the low-level the OpenSearch.Net client you can use if anything is missing: + +```cs +var lowLevelClient = client.LowLevel; + +var searchResponseLow = lowLevelClient.Search>("students", + PostData.Serializable( + new + { + query = new + { + match = new + { + lastName = new + { + query = "Santos" + } + } + } + })); + +if (searchResponseLow.IsValid) +{ + foreach (var s in searchResponseLow.Documents) + { + Console.WriteLine($"{s.Id} {s.LastName} {s.FirstName} {s.Gpa} {s.GradYear}"); + } +} +``` +{% include copy.html %} + +## Sample program + +The following is a complete sample program that illustrates all of the concepts described above. It uses the Student class defined above. 
+ +```cs +using OpenSearch.Client; +using OpenSearch.Net; + +namespace NetClientProgram; + +internal class Program +{ + private static IOpenSearchClient osClient = new OpenSearchClient(); + + public static void Main(string[] args) + { + Console.WriteLine("Indexing one student......"); + var student = new Student { Id = 100, + FirstName = "Paulo", + LastName = "Santos", + Gpa = 3.93, + GradYear = 2021 }; + var response = osClient.Index(student, i => i.Index("students")); + Console.WriteLine(response.IsValid ? "Response received" : "Error"); + + Console.WriteLine("Searching for one student......"); + SearchForOneStudent(); + + Console.WriteLine("Searching using low-level client......"); + SearchLowLevel(); + + Console.WriteLine("Indexing an array of Student objects......"); + var studentArray = new Student[] + { + new() { Id = 200, + FirstName = "Shirley", + LastName = "Rodriguez", + Gpa = 3.91, + GradYear = 2019}, + new() { Id = 300, + FirstName = "Nikki", + LastName = "Wolf", + Gpa = 3.87, + GradYear = 2020} + }; + var manyResponse = osClient.IndexMany(studentArray, "students"); + Console.WriteLine(manyResponse.IsValid ? "Response received" : "Error"); + } + + private static void SearchForOneStudent() + { + var searchResponse = osClient.Search(s => s + .Index("students") + .Query(q => q + .Match(m => m + .Field(fld => fld.LastName) + .Query("Santos")))); + + PrintResponse(searchResponse); + } + + private static void SearchForAllStudentsWithANonEmptyLastName() + { + var searchResponse = osClient.Search(s => s + .Index("students") + .Query(q => q + .Bool(b => b + .Must(m => m.Exists(fld => fld.LastName)) + .MustNot(m => m.Term(t => t.Verbatim().Field(fld => fld.LastName).Value(string.Empty))) + ))); + + PrintResponse(searchResponse); + } + + private static void SearchLowLevel() + { + // Search for the student using the low-level client + var lowLevelClient = osClient.LowLevel; + + var searchResponseLow = lowLevelClient.Search> + ("students", + PostData.Serializable( + new + { + query = new + { + match = new + { + lastName = new + { + query = "Santos" + } + } + } + })); + + PrintResponse(searchResponseLow); + } + + private static void PrintResponse(ISearchResponse response) + { + if (response.IsValid) + { + foreach (var s in response.Documents) + { + Console.WriteLine($"{s.Id} {s.LastName} " + + $"{s.FirstName} {s.Gpa} {s.GradYear}"); + } + } + else + { + Console.WriteLine("Student not found."); + } + } +} +``` +{% include copy.html %} diff --git a/_clients/OSC-example.md b/_clients/OSC-example.md new file mode 100644 index 00000000..4511e2b4 --- /dev/null +++ b/_clients/OSC-example.md @@ -0,0 +1,328 @@ +--- +layout: default +title: More advanced features of the high-level .NET client +nav_order: 12 +has_children: false +parent: .NET clients +--- + +# More advanced features of the high-level .NET client (OpenSearch.Client) + +The following example illustrates more advanced features of OpenSearch.Client. For a simple example, see the [Getting started guide]({{site.url}}{{site.baseurl}}/clients/OSC-dot-net/). This example uses the following Student class. + +```cs +public class Student +{ + public int Id { get; init; } + public string FirstName { get; init; } + public string LastName { get; init; } + public int GradYear { get; init; } + public double Gpa { get; init; } +} +``` +{% include copy.html %} + +## Mappings + +OpenSearch uses dynamic mapping to infer field types of the documents that are indexed. 
However, to have more control over the schema of your document, you can pass an explicit mapping to OpenSearch. You can define data types for some or all fields of your document in this mapping. + +Similarly, OpenSearch.Client uses auto mapping to infer field data types based on the types of the class's properties. To use auto mapping, create a `students` index using the AutoMap's default constructor: + +```cs +var createResponse = await osClient.Indices.CreateAsync("students", + c => c.Map(m => m.AutoMap())); +``` +{% include copy.html %} + +If you use auto mapping, Id and GradYear are mapped as integers, Gpa is mapped as a double, and FirstName and LastName are mapped as text with a keyword subfield. If you want to search for FirstName and LastName and allow only case-sensitive full matches, you can suppress analyzing by mapping these fields as keyword only. In Query DSL, you can accomplish this using the following query: + +```json +PUT students +{ + "mappings" : { + "properties" : { + "firstName" : { + "type" : "keyword" + }, + "lastName" : { + "type" : "keyword" + } + } + } +} +``` + +In OpenSearch.Client, you can use fluid lambda syntax to mark these fields as keywords: + +```cs +var createResponse = await osClient.Indices.CreateAsync(index, + c => c.Map(m => m.AutoMap() + .Properties(p => p + .Keyword(k => k.Name(f => f.FirstName)) + .Keyword(k => k.Name(f => f.LastName))))); +``` +{% include copy.html %} + +## Settings + +In addition to mappings, you can specify settings like the number of primary and replica shards when creating an index. The following query sets the number of primary shards to 1 and the number of replica shards to 2: + +```json +PUT students +{ + "mappings" : { + "properties" : { + "firstName" : { + "type" : "keyword" + }, + "lastName" : { + "type" : "keyword" + } + } + }, + "settings": { + "number_of_shards": 1, + "number_of_replicas": 2 + } +} +``` + +In OpenSearch.Client, the equivalent of the above query is the following: + +```cs +var createResponse = await osClient.Indices.CreateAsync(index, + c => c.Map(m => m.AutoMap() + .Properties(p => p + .Keyword(k => k.Name(f => f.FirstName)) + .Keyword(k => k.Name(f => f.LastName)))) + .Settings(s => s.NumberOfShards(1).NumberOfReplicas(2))); +``` +{% include copy.html %} + +## Indexing multiple documents using the Bulk API + +In addition to indexing one document using `Index` and `IndexDocument` and indexing multiple documents using `IndexMany`, you can gain more control over document indexing by using `Bulk` or `BulkAll`. Indexing documents individually is inefficient because it creates an HTTP request for every document sent. The BulkAll helper frees you from handling retry, chunking or back off request functionality. It automatically retries if the request fails, backs off if the server is down, and controls how many documents are sent in one HTTP request. + +In the following example, `BulkAll` is configured with the index name, number of back off retries, and back off time. Additionally, the maximum degrees of parallelism setting controls the number of parallel HTTP requests containing the data. Finally, the size parameter signals how many documents are sent in one HTTP request. + +We recommend setting the size to 100–1000 documents in production. +{: .tip} + +`BulkAll` takes a stream of data and returns an Observable that you can use to observe the background operation. 
+ +```cs +var bulkAll = osClient.BulkAll(ReadData(), r => r + .Index(index) + .BackOffRetries(2) + .BackOffTime("30s") + .MaxDegreeOfParallelism(4) + .Size(100)); +``` +{% include copy.html %} + +## Searching with Boolean query + +OpenSearch.Client exposes full OpenSearch query capability. In addition to simple searches that use the match query, you can create a more complex Boolean query to search for students who graduated in 2022 and sort them by last name. In the example below, search is limited to 10 documents, and the scroll API is used to control the pagination of results. + +```cs +var gradResponse = await osClient.SearchAsync(s => s + .Index(index) + .From(0) + .Size(10) + .Scroll("1m") + .Query(q => q + .Bool(b => b + .Filter(f => f + .Term(t => t.Field(fld => fld.GradYear).Value(2022))))) + .Sort(srt => srt.Ascending(f => f.LastName))); +``` +{% include copy.html %} + +The response contains the Documents property with matching documents from OpenSearch. The data is in the form of deserialized JSON objects of Student type, so you can access their properties in a strongly typed fashion. All serialization and deserialization is handled by OpenSearch.Client. + +## Aggregations + +OpenSearch.Client includes the full OpenSearch query functionality, including aggregations. In addition to grouping search results into buckets (for example, grouping students by GPA ranges), you can calculate metrics like sum or average. The following query calculates the average GPA of all students in the index. + +Setting Size to 0 means OpenSearch will only return the aggregation, not the actual documents. +{: .tip} + +```cs +var aggResponse = await osClient.SearchAsync(s => s + .Index(index) + .Size(0) + .Aggregations(a => a + .Average("average gpa", + avg => avg.Field(fld => fld.Gpa)))); +``` +{% include copy.html %} + +## Sample program for creating an index and indexing data + +The following program creates an index, reads a stream of student records from a comma-separated file and indexes this data into OpenSearch. + +```cs +using OpenSearch.Client; + +namespace NetClientProgram; + +internal class Program +{ + private const string index = "students"; + + public static IOpenSearchClient osClient = new OpenSearchClient(); + + public static async Task Main(string[] args) + { + // Check if the index with the name "students" exists + var existResponse = await osClient.Indices.ExistsAsync(index); + + if (!existResponse.Exists) // There is no index with this name + { + // Create an index "students" + // Map FirstName and LastName as keyword + var createResponse = await osClient.Indices.CreateAsync(index, + c => c.Map(m => m.AutoMap() + .Properties(p => p + .Keyword(k => k.Name(f => f.FirstName)) + .Keyword(k => k.Name(f => f.LastName)))) + .Settings(s => s.NumberOfShards(1).NumberOfReplicas(1))); + + if (!createResponse.IsValid && !createResponse.Acknowledged) + { + throw new Exception("Create response is invalid."); + } + + // Take a stream of data and send it to OpenSearch + var bulkAll = osClient.BulkAll(ReadData(), r => r + .Index(index) + .BackOffRetries(2) + .BackOffTime("20s") + .MaxDegreeOfParallelism(4) + .Size(10)); + + // Wait until the data upload is complete. + // FromMinutes specifies a timeout. + // r is a response object that is returned as the data is indexed. 
+ bulkAll.Wait(TimeSpan.FromMinutes(10), r => + Console.WriteLine("Data chunk indexed")); + } + } + + // Reads student data in the form "Id,FirsName,LastName,GradYear,Gpa" + public static IEnumerable ReadData() + { + var file = new StreamReader("C:\\search\\students.csv"); + + string s; + while ((s = file.ReadLine()) is not null) + { + yield return new Student(s); + } + } +} +``` +{% include copy.html %} + +## Sample program for search + +The following program searches students by name and graduation date and calculates the average GPA. + +```cs +using OpenSearch.Client; + +namespace NetClientProgram; + +internal class Program +{ + private const string index = "students"; + + public static IOpenSearchClient osClient = new OpenSearchClient(); + + public static async Task Main(string[] args) + { + await SearchByName(); + + await SearchByGradDate(); + + await CalculateAverageGpa(); + } + + private static async Task SearchByName() + { + Console.WriteLine("Searching for name......"); + + var nameResponse = await osClient.SearchAsync(s => s + .Index(index) + .Query(q => q + .Match(m => m + .Field(fld => fld.FirstName) + .Query("Zhang")))); + + if (!nameResponse.IsValid) + { + throw new Exception("Aggregation query response is not valid."); + } + + foreach (var s in nameResponse.Documents) + { + Console.WriteLine($"{s.Id} {s.LastName} " + + $"{s.FirstName} {s.Gpa} {s.GradYear}"); + } + } + + private static async Task SearchByGradDate() + { + Console.WriteLine("Searching for grad date......"); + + // Search for all students who graduated in 2022 + var gradResponse = await osClient.SearchAsync(s => s + .Index(index) + .From(0) + .Size(2) + .Scroll("1m") + .Query(q => q + .Bool(b => b + .Filter(f => f + .Term(t => t.Field(fld => fld.GradYear).Value(2022))))) + .Sort(srt => srt.Ascending(f => f.LastName)) + .Size(10)); + + + if (!gradResponse.IsValid) + { + throw new Exception("Grad date query response is not valid."); + } + + while (gradResponse.Documents.Any()) + { + foreach (var data in gradResponse.Documents) + { + Console.WriteLine($"{data.Id} {data.LastName} {data.FirstName} " + + $"{data.Gpa} {data.GradYear}"); + } + gradResponse = osClient.Scroll("1m", gradResponse.ScrollId); + } + } + + public static async Task CalculateAverageGpa() + { + Console.WriteLine("Calculating average GPA......"); + + // Search and aggregate + // Size 0 means documents are not returned, only aggregation is returned + var aggResponse = await osClient.SearchAsync(s => s + .Index(index) + .Size(0) + .Aggregations(a => a + .Average("average gpa", + avg => avg.Field(fld => fld.Gpa)))); + + if (!aggResponse.IsValid) throw new Exception("Aggregation response not valid"); + + var avg = aggResponse.Aggregations.Average("average gpa").Value; + Console.WriteLine($"Average GPA is {avg}"); + } +} +``` +{% include copy.html %} \ No newline at end of file diff --git a/_clients/OpenSearch-dot-net.md b/_clients/OpenSearch-dot-net.md new file mode 100644 index 00000000..86488928 --- /dev/null +++ b/_clients/OpenSearch-dot-net.md @@ -0,0 +1,474 @@ +--- +layout: default +title: Low-level .NET client +nav_order: 30 +has_children: false +parent: .NET clients +--- + +# Low-level .NET client (OpenSearch.Net) + +OpenSearch.Net is a low-level .NET client that provides the foundational layer of communication with OpenSearch. It is dependency free, and it can handle round-robin load balancing, transport, and the basic request/response cycle. OpenSearch.Net contains all OpenSearch API endpoints as methods. 
When using OpenSearch.Net, you need to construct the queries yourself. + +This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [opensearch-net repo](https://github.com/opensearch-project/opensearch-net). + +## Stable Release + +This documentation reflects the latest updates available in the [GitHub repository](https://github.com/opensearch-project/opensearch-net) and may include changes unavailable in the current stable release. The current stable release in NuGet is [1.2.0](https://www.nuget.org/packages/OpenSearch.Net.Auth.AwsSigV4/1.2.0). + +## Example + +The following example illustrates connecting to OpenSearch, indexing documents, and sending queries on the data. It uses the Student class to represent one student, which is equivalent to one document in the index. + +```cs +public class Student +{ + public int Id { get; init; } + public string FirstName { get; init; } + public string LastName { get; init; } + public int GradYear { get; init; } + public double Gpa { get; init; } +} +``` +{% include copy.html %} + +## Installing the Opensearch.Net client + +To install Opensearch.Net, download the [Opensearch.Net NuGet package](https://www.nuget.org/packages/OpenSearch.Net) and add it to your project in an IDE of your choice. In Microsoft Visual Studio, follow the steps below: +- In the **Solution Explorer** panel, right-click on your solution or project and select **Manage NuGet Packages for Solution**. +- Search for the OpenSearch.Net NuGet package, and select **Install**. + +Alternatively, you can add OpenSearch.Net to your .csproj file: + +```xml + + ... + + + + +``` +{% include copy.html %} + +## Connecting to OpenSearch + +Use the default constructor when creating an OpenSearchLowLevelClient object to connect to the default OpenSearch host (`http://localhost:9200`). + +```cs +var client = new OpenSearchLowLevelClient(); +``` +{% include copy.html %} + +To connect to your OpenSearch cluster through a single node with a known address, create a ConnectionConfiguration object with that address and pass it to the OpenSearch.Net constructor: + +```cs +var nodeAddress = new Uri("http://myserver:9200"); +var config = new ConnectionConfiguration(nodeAddress); +var client = new OpenSearchLowLevelClient(config); +``` +{% include copy.html %} + +You can also use a [connection pool]({{site.url}}{{site.baseurl}}/clients/dot-net-conventions#connection-pools) to manage the nodes in the cluster. Additionally, you can set up a connection configuration to have OpenSearch return the response as formatted JSON. + +```cs +var uri = new Uri("http://localhost:9200"); +var connectionPool = new SingleNodeConnectionPool(uri); +var settings = new ConnectionConfiguration(connectionPool).PrettyJson(); +var client = new OpenSearchLowLevelClient(settings); +``` +{% include copy.html %} + +To connect to your OpenSearch cluster using multiple nodes, create a connection pool with their addresses. In this example, a [`SniffingConnectionPool`]({{site.url}}{{site.baseurl}}/clients/dot-net-conventions#connection-pools) is used because it keeps track of nodes being removed or added to the cluster, so it works best for clusters that scale automatically. 
+ +```cs +var uris = new[] +{ + new Uri("http://localhost:9200"), + new Uri("http://localhost:9201"), + new Uri("http://localhost:9202") +}; +var connectionPool = new SniffingConnectionPool(uris); +var settings = new ConnectionConfiguration(connectionPool).PrettyJson(); +var client = new OpenSearchLowLevelClient(settings); +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Service + +The following example illustrates connecting to Amazon OpenSearch Service: + +```cs +using OpenSearch.Client; +using OpenSearch.Net.Auth.AwsSigV4; + +namespace Application +{ + class Program + { + static void Main(string[] args) + { + var endpoint = new Uri("https://search-xxx.region.es.amazonaws.com"); + var connection = new AwsSigV4HttpConnection(RegionEndpoint.APSoutheast2, service: AwsSigV4HttpConnection.OpenSearchService); + var config = new ConnectionSettings(endpoint, connection); + var client = new OpenSearchClient(config); + + Console.WriteLine($"{client.RootNodeInfo().Version.Distribution}: {client.RootNodeInfo().Version.Number}"); + } + } +} +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Serverless + +The following example illustrates connecting to Amazon OpenSearch Serverless Service: + +```cs +using OpenSearch.Client; +using OpenSearch.Net.Auth.AwsSigV4; + +namespace Application +{ + class Program + { + static void Main(string[] args) + { + var endpoint = new Uri("https://search-xxx.region.aoss.amazonaws.com"); + var connection = new AwsSigV4HttpConnection(RegionEndpoint.APSoutheast2, service: AwsSigV4HttpConnection.OpenSearchServerlessService); + var config = new ConnectionSettings(endpoint, connection); + var client = new OpenSearchClient(config); + + Console.WriteLine($"{client.RootNodeInfo().Version.Distribution}: {client.RootNodeInfo().Version.Number}"); + } + } +} +``` +{% include copy.html %} + + +## Using ConnectionSettings + +`ConnectionConfiguration` is used to pass configuration options to the OpenSearch.Net client. `ConnectionSettings` inherits from `ConnectionConfiguration` and provides additional configuration options. +The following example uses `ConnectionSettings` to: +- Set the default index name for requests that don't specify the index name. +- Enable gzip-compressed requests and responses. +- Signal to OpenSearch to return formatted JSON. +- Make field names lowercase. 
+ +```cs +var uri = new Uri("http://localhost:9200"); +var connectionPool = new SingleNodeConnectionPool(uri); +var settings = new ConnectionSettings(connectionPool) + .DefaultIndex("students") + .EnableHttpCompression() + .PrettyJson() + .DefaultFieldNameInferrer(f => f.ToLower()); + +var client = new OpenSearchLowLevelClient(settings); +``` +{% include copy.html %} + +## Indexing one document + +To index a document, you first need to create an instance of the Student class: + +```cs +var student = new Student { + Id = 100, + FirstName = "Paulo", + LastName = "Santos", + Gpa = 3.93, + GradYear = 2021 +}; +``` +{% include copy.html %} + +Alternatively, you can create an instance of Student using an anonymous type: + +```cs +var student = new { + Id = 100, + FirstName = "Paulo", + LastName = "Santos", + Gpa = 3.93, + GradYear = 2021 +}; +``` +{% include copy.html %} + +Next, upload this Student into the `students` index using the `Index` method: + +```cs +var response = client.Index("students", "100", + PostData.Serializable(student)); +Console.WriteLine(response.Body); +``` +{% include copy.html %} + +The generic type parameter of the `Index` method specifies the response body type. In the example above, the response is a string. + +## Indexing many documents using the Bulk API + +To index many documents, use the Bulk API to bundle many operations into one request: + +```cs +var studentArray = new object[] +{ + new {index = new { _index = "students", _type = "_doc", _id = "200"}}, + new { Id = 200, + FirstName = "Shirley", + LastName = "Rodriguez", + Gpa = 3.91, + GradYear = 2019 + }, + new {index = new { _index = "students", _type = "_doc", _id = "300"}}, + new { Id = 300, + FirstName = "Nikki", + LastName = "Wolf", + Gpa = 3.87, + GradYear = 2020 + } +}; + +var manyResponse = client.Bulk(PostData.MultiJson(studentArray)); +``` +{% include copy.html %} + +You can send the request body as an anonymous object, string, byte array, or stream in APIs that take a body. For APIs that take multiline JSON, you can send the body as a list of bytes or a list of objects, like in the example above. The `PostData` class has static methods to send the body in all of these forms. + +## Searching for a document + +To construct a Query DSL query, use anonymous types within the request body. The following query searches for all students who graduated in 2021: + +```cs +var searchResponseLow = client.Search("students", + PostData.Serializable( + new + { + from = 0, + size = 20, + + query = new + { + term = new + { + gradYear = new + { + value = 2019 + } + } + } + })); + +Console.WriteLine(searchResponseLow.Body); +``` +{% include copy.html %} + +Alternatively, you can use strings to construct the request. 
When using strings, you have to escape the `"` character: + +```cs +var searchResponse = client.Search("students", + @" { + ""query"": + { + ""match"": + { + ""lastName"": + { + ""query"": ""Santos"" + } + } + } + }"); + +Console.WriteLine(searchResponse.Body); +``` +{% include copy.html %} + +## Using OpenSearch.Net methods asynchronously + +For applications that require asynchronous code, all method calls in OpenSearch.Client have asynchronous counterparts: + +```cs +// synchronous method +var response = client.Index("students", "100", + PostData.Serializable(student)); + +// asynchronous method +var response = client.IndexAsync("students", "100", + PostData.Serializable(student)); +``` +{% include copy.html %} + +## Handling exceptions + +By default, OpenSearch.Net does not throw exceptions when an operation is unsuccessful. In particular, OpenSearch.Net does not throw exceptions if the response status code has one of the expected values for this request. For example, the following query searches for a document in an index that does not exist: + +```cs +var searchResponse = client.Search("students1", + @" { + ""query"": + { + ""match"": + { + ""lastName"": + { + ""query"": ""Santos"" + } + } + } + }"); + +Console.WriteLine(searchResponse.Body); +``` +{% include copy.html %} + +The response contains an error status code 404, which is one of the expected error codes for search requests, so no exception is thrown. You can see the status code in the `status` field: + +```json +{ + "error" : { + "root_cause" : [ + { + "type" : "index_not_found_exception", + "reason" : "no such index [students1]", + "index" : "students1", + "resource.id" : "students1", + "resource.type" : "index_or_alias", + "index_uuid" : "_na_" + } + ], + "type" : "index_not_found_exception", + "reason" : "no such index [students1]", + "index" : "students1", + "resource.id" : "students1", + "resource.type" : "index_or_alias", + "index_uuid" : "_na_" + }, + "status" : 404 +} +``` + +To configure OpenSearch.Net to throw exceptions, turn on the `ThrowExceptions()` setting on `ConnectionConfiguration`: + +```cs +var uri = new Uri("http://localhost:9200"); +var connectionPool = new SingleNodeConnectionPool(uri); +var settings = new ConnectionConfiguration(connectionPool) + .PrettyJson().ThrowExceptions(); +var client = new OpenSearchLowLevelClient(settings); +``` +{% include copy.html %} + +You can use the following properties of the response object to determine response success: + +```cs +Console.WriteLine("Success: " + searchResponse.Success); +Console.WriteLine("SuccessOrKnownError: " + searchResponse.SuccessOrKnownError); +Console.WriteLine("Original Exception: " + searchResponse.OriginalException); +``` + +- `Success` returns true if the response code is in the 2xx range or the response code has one of the expected values for this request. +- `SuccessOrKnownError` returns true if the response is successful or the response code is in the 400–501 or 505–599 ranges. If SuccessOrKnownError is true, the request is not retried. +- `OriginalException` holds the original exception for the unsuccessful responses. + +## Sample program + +The following program creates an index, indexes data, and searches for documents. 
+ +```cs +using OpenSearch.Net; +using OpenSearch.Client; + +namespace NetClientProgram; + +internal class Program +{ + public static void Main(string[] args) + { + // Create a client with custom settings + var uri = new Uri("http://localhost:9200"); + var connectionPool = new SingleNodeConnectionPool(uri); + var settings = new ConnectionSettings(connectionPool) + .PrettyJson(); + var client = new OpenSearchLowLevelClient(settings); + + + Console.WriteLine("Indexing one student......"); + var student = new Student { + Id = 100, + FirstName = "Paulo", + LastName = "Santos", + Gpa = 3.93, + GradYear = 2021 }; + var response = client.Index("students", "100", + PostData.Serializable(student)); + Console.WriteLine(response.Body); + + Console.WriteLine("Indexing many students......"); + var studentArray = new object[] + { + new { index = new { _index = "students", _type = "_doc", _id = "200"}}, + new { + Id = 200, + FirstName = "Shirley", + LastName = "Rodriguez", + Gpa = 3.91, + GradYear = 2019}, + new { index = new { _index = "students", _type = "_doc", _id = "300"}}, + new { + Id = 300, + FirstName = "Nikki", + LastName = "Wolf", + Gpa = 3.87, + GradYear = 2020} + }; + + var manyResponse = client.Bulk(PostData.MultiJson(studentArray)); + + Console.WriteLine(manyResponse.Body); + + + Console.WriteLine("Searching for students who graduated in 2019......"); + var searchResponseLow = client.Search("students", + PostData.Serializable( + new + { + from = 0, + size = 20, + + query = new + { + term = new + { + gradYear = new + { + value = 2019 + } + } + } + })); + + Console.WriteLine(searchResponseLow.Body); + + Console.WriteLine("Searching for a student with the last name Santos......"); + + var searchResponse = client.Search("students", + @" { + ""query"": + { + ""match"": + { + ""lastName"": + { + ""query"": ""Santos"" + } + } + } + }"); + + Console.WriteLine(searchResponse.Body); + } +} +``` +{% include copy.html %} diff --git a/_clients/agents-and-ingestion-tools/index.md b/_clients/agents-and-ingestion-tools/index.md deleted file mode 100644 index 04adfb1e..00000000 --- a/_clients/agents-and-ingestion-tools/index.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -layout: default -title: Agents and ingestion tools -nav_order: 100 -has_children: false -has_toc: false -redirect_from: - - /clients/agents-and-ingestion-tools/ ---- - -# Agents and ingestion tools - -Historically, many multiple popular agents and ingestion tools have worked with Elasticsearch OSS, such as Beats, Logstash, Fluentd, FluentBit, and OpenTelemetry. OpenSearch aims to continue to support a broad set of agents and ingestion tools, but not all have been tested or have explicitly added OpenSearch compatibility. - -As an intermediate compatibility solution, OpenSearch has a setting that instructs the cluster to return version 7.10.2 rather than its actual version. - -If you use clients that include a version check, such as recent versions of Logstash OSS or Filebeat OSS, enable the setting: - -```json -PUT _cluster/settings -{ - "persistent": { - "compatibility": { - "override_main_response_version": true - } - } -} -``` - -[Just like any other setting]({{site.url}}{{site.baseurl}}/opensearch/configuration/), the alternative is to add the following line to `opensearch.yml` on each node and then restart the node: - -```yml -compatibility.override_main_response_version: true -``` - - -## Downloads - -You can download the OpenSearch output plugin for Logstash from [OpenSearch downloads](https://opensearch.org/downloads.html). 
The Logstash output plugin is compatible with OpenSearch and Elasticsearch OSS (7.10.2 or lower). - -These are the latest versions of Beats OSS with OpenSearch compatibility. For more information, see the [compatibility matrices](#compatibility-matrices). - -- [Filebeat OSS 7.12.1](https://www.elastic.co/downloads/past-releases/filebeat-oss-7-12-1) -- [Metricbeat OSS 7.12.1](https://www.elastic.co/downloads/past-releases/metricbeat-oss-7-12-1) -- [Packetbeat OSS 7.12.1](https://www.elastic.co/downloads/past-releases/packetbeat-oss-7-12-1) -- [Heartbeat OSS 7.12.1](https://elastic.co/downloads/past-releases/heartbeat-oss-7-12-1) -- [Winlogbeat OSS 7.12.1](https://www.elastic.co/downloads/past-releases/winlogbeat-oss-7-12-1) -- [Auditbeat OSS 7.12.1](https://elastic.co/downloads/past-releases/auditbeat-oss-7-12-1) - -Some users report compatibility issues with ingest pipelines on these versions of Beats. If you use ingest pipelines with OpenSearch, consider using the 7.10.2 versions of Beats instead. -{: .note } - - -## Compatibility Matrices - -*Italicized* cells are untested, but indicate what a value theoretically should be based on existing information. - - -### Compatibility Matrix for Logstash - -| | Logstash OSS 7.x to 7.11.x | Logstash OSS 7.12.x\* | Logstash 7.13.x without OpenSearch output plugin | Logstash 7.13.x with OpenSearch output plugin | -| :---| :--- | :--- | :--- | :--- | -| Elasticsearch OSS 7.x to 7.9.x | *Yes* | *Yes* | *No* | *Yes* | -| Elasticsearch OSS 7.10.2 | *Yes* | *Yes* | *No* | *Yes* | -| ODFE 1.x to 1.12 | *Yes* | *Yes* | *No* | *Yes* | -| ODFE 1.13 | *Yes* | *Yes* | *No* | *Yes* | -| OpenSearch 1.0 | Yes via version setting | Yes via version setting | *No* | *Yes* | - -\* Most current compatible version with Elasticsearch OSS. - - -### Compatibility Matrix for Beats - -| | Beats OSS 7.x to 7.11.x\*\* | Beats OSS 7.12.x\* | Beats 7.13.x | -| :--- | :--- | :--- | :--- | -| Elasticsearch OSS 7.x to 7.9.x | *Yes* | *Yes* | No | -| Elasticsearch OSS 7.10.2 | *Yes* | *Yes* | No | -| ODFE 1.x to 1.12 | *Yes* | *Yes* | No | -| ODFE 1.13 | *Yes* | *Yes* | No | -| OpenSearch 1.0 | Yes via version setting | Yes via version setting | No | -| Logstash OSS 7.x to 7.11.x | *Yes* | *Yes* | *Yes* | -| Logstash OSS 7.12.x\* | *Yes* | *Yes* | *Yes* | -| Logstash 7.13.x with OpenSearch output plugin | *Yes* | *Yes* | *Yes* | - -\* Most current compatible version with Elasticsearch OSS. - -\*\* Beats OSS includes all Apache 2.0 Beats agents (i.e. Filebeat, Metricbeat, Auditbeat, Heartbeat, Winlogbeat, Packetbeat). diff --git a/_clients/dot-net-conventions.md b/_clients/dot-net-conventions.md new file mode 100644 index 00000000..a0ee0295 --- /dev/null +++ b/_clients/dot-net-conventions.md @@ -0,0 +1,95 @@ +--- +layout: default +title: .NET client considerations +nav_order: 20 +has_children: false +parent: .NET clients +--- + +# .NET client considerations and best practices + +The following sections provide information regarding the considerations and best practices for using .NET clients. + +## Registering OpenSearch.Client as a singleton + +As a rule, you should set up your OpenSearch.Client as a singleton. OpenSearch.Client manages connections to the server and the states of the nodes in a cluster. Additionally, each client uses a lot of configuration for its setup. Therefore, it is beneficial to create an OpenSearch.Client instance once and reuse it for all OpenSearch operations. The client is thread safe, so the same instance can be shared by multiple threads. 
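+
+For example, in an ASP.NET Core application you can register the client with the dependency injection container. The following is a minimal sketch only; the node address and default index are placeholder values, and your own registration will depend on how your application is hosted and secured:
+
+```cs
+using OpenSearch.Client;
+using OpenSearch.Net;
+
+var builder = WebApplication.CreateBuilder(args);
+
+// Create the client once and reuse the same instance everywhere.
+builder.Services.AddSingleton<IOpenSearchClient>(_ =>
+{
+    var pool = new SingleNodeConnectionPool(new Uri("http://localhost:9200")); // Placeholder address.
+    var settings = new ConnectionSettings(pool).DefaultIndex("students");      // Placeholder index.
+    return new OpenSearchClient(settings);
+});
+
+var app = builder.Build();
+app.Run();
+```
+{% include copy.html %}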
+
+## Exceptions
+
+The following are the types of exceptions that may be thrown by .NET clients:
+
+- `OpenSearchClientException` is a known exception that occurs either in the request pipeline (for example, timeout reached) or in OpenSearch (for example, malformed query). If it is an OpenSearch exception, the `ServerError` response property contains the error that OpenSearch returns.
+- `UnexpectedOpenSearchClientException` is an unknown exception (for example, an error during deserialization) and is a subclass of `OpenSearchClientException`.
+- System exceptions are thrown when the API is not used properly.
+
+## Nodes
+
+To create a node, pass a `Uri` object into its constructor:
+
+```cs
+var uri = new Uri("http://example.org/opensearch");
+var node = new Node(uri);
+```
+{% include copy.html %}
+
+When first created, a node is master eligible, and its `HoldsData` property is set to true.
+The `AbsolutePath` property of the node created above is `"/opensearch/"`: a trailing forward slash is appended so that the paths can be easily combined. If not specified, the default `Port` is 80.
+
+Nodes are considered equal if they have the same endpoint. Metadata is not taken into account when checking nodes for equality.
+{: .note}
+
+## Connection pools
+
+Connection pools are instances of `IConnectionPool` and are responsible for managing the nodes in the OpenSearch cluster. We recommend creating a [singleton client](#registering-opensearchclient-as-a-singleton) with a single `ConnectionSettings` object. The lifetime of both the client and its `ConnectionSettings` is the lifetime of the application.
+
+The following are the available connection pool types; a short sketch of passing a pool to `ConnectionSettings` follows the list.
+
+- **SingleNodeConnectionPool**
+
+`SingleNodeConnectionPool` is the default connection pool that is used if no connection pool is passed to the `ConnectionSettings` constructor. Use `SingleNodeConnectionPool` if you have only one node in the cluster or if your cluster has a load balancer as an entry point. `SingleNodeConnectionPool` does not support sniffing or pinging and does not mark nodes as dead or alive.
+
+- **CloudConnectionPool**
+
+`CloudConnectionPool` is a subclass of `SingleNodeConnectionPool` that takes a Cloud ID and credentials. Like `SingleNodeConnectionPool`, `CloudConnectionPool` does not support sniffing or pinging.
+
+- **StaticConnectionPool**
+
+`StaticConnectionPool` is used for a small cluster when you do not want to turn on sniffing to learn about cluster topology. `StaticConnectionPool` does not support sniffing, but can support pinging.
+
+- **SniffingConnectionPool**
+
+`SniffingConnectionPool` is a subclass of `StaticConnectionPool`. It is thread safe and supports sniffing and pinging. `SniffingConnectionPool` can be reseeded at run time, and you can specify node roles when seeding.
+
+- **StickyConnectionPool**
+
+`StickyConnectionPool` is set up to return the first live node, which then persists between requests. It can be seeded using an enumerable of `Uri` or `Node` objects. `StickyConnectionPool` does not support sniffing but supports pinging.
+
+- **StickySniffingConnectionPool**
+
+`StickySniffingConnectionPool` is a subclass of `SniffingConnectionPool`. Like `StickyConnectionPool`, it returns the first live node, which then persists between requests. `StickySniffingConnectionPool` supports sniffing and sorting so that each instance of your application can favor a different node. Nodes have weights associated with them and can be sorted by weight.
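+
+The following minimal sketch shows one way to pass a pool to `ConnectionSettings`. The seed node addresses and index name are placeholders for your own cluster:
+
+```cs
+using OpenSearch.Client;
+using OpenSearch.Net;
+
+// Placeholder seed nodes; replace them with the addresses of your own cluster nodes.
+var seedNodes = new[]
+{
+    new Uri("http://localhost:9200"),
+    new Uri("http://localhost:9201"),
+    new Uri("http://localhost:9202")
+};
+
+// SniffingConnectionPool discovers the rest of the cluster topology from these seeds.
+var pool = new SniffingConnectionPool(seedNodes);
+var settings = new ConnectionSettings(pool).DefaultIndex("students"); // Placeholder index.
+var client = new OpenSearchClient(settings);
+```
+{% include copy.html %}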
+ +## Retries + +If a request does not succeed, it is automatically retried. By default, the number of retries is the number of nodes known to OpenSearch.Client in your cluster. The number of retries is also limited by the timeout parameter, so OpenSearch.Client retries requests as many times as possible within the timeout period. + +To set the maximum number of retries, specify the number in the `MaximumRetries` property on the `ConnectionSettings` object. + +```cs +var settings = new ConnectionSettings(connectionPool).MaximumRetries(5); +``` +{% include copy.html %} + +You can also set a `RequestTimeout` that specifies a timeout for a single request and a `MaxRetryTimeout` that specifies the time limit for all retry attempts. In the example below, `RequestTimeout` is set to 4 seconds, and `MaxRetryTimeout` is set to 12 seconds, so the maximum number of attempts for a query is 3. + +```cs +var settings = new ConnectionSettings(connectionPool) + .RequestTimeout(TimeSpan.FromSeconds(4)) + .MaxRetryTimeout(TimeSpan.FromSeconds(12)); +``` +{% include copy.html %} + +## Failover + +If you are using a connection pool with multiple nodes, a request is retried if it returns a 502 (Bad Gateway), 503 (Service Unavailable), or 504 (Gateway Timeout) HTTP error response code. If the response code is an error code in the 400–501 or 505–599 ranges, the request is not retried. + +A response is considered valid if the response code is in the 2xx range or the response code has one of the expected values for this request. For example, 404 (Not Found) is a valid response for a request that checks whether an index exists. \ No newline at end of file diff --git a/_clients/dot-net.md b/_clients/dot-net.md new file mode 100644 index 00000000..8e212e0e --- /dev/null +++ b/_clients/dot-net.md @@ -0,0 +1,23 @@ +--- +layout: default +title: .NET clients +nav_order: 75 +has_children: true +has_toc: false +--- + +# .NET clients + +OpenSearch has two .NET clients: a low-level [OpenSearch.Net]({{site.url}}{{site.baseurl}}/clients/OpenSearch-dot-net/) client and a high-level [OpenSearch.Client]({{site.url}}{{site.baseurl}}/clients/OSC-dot-net/) client. + +[OpenSearch.Net]({{site.url}}{{site.baseurl}}/clients/OpenSearch-dot-net/) is a low-level .NET client that provides the foundational layer of communication with OpenSearch. It is dependency free, and it can handle round-robin load balancing, transport, and the basic request/response cycle. OpenSearch.Net contains methods for all OpenSearch API endpoints. + +[OpenSearch.Client]({{site.url}}{{site.baseurl}}/clients/OSC-dot-net/) is a high-level .NET client on top of OpenSearch.Net. It provides strongly typed requests and responses as well as Query DSL. It frees you from constructing raw JSON requests and parsing raw JSON responses by supplying models that parse and serialize/deserialize requests and responses automatically. OpenSearch.Client also exposes the OpenSearch.Net low-level client if you need it. OpenSearch.Client includes the following advanced functionality: + +- Automapping: Given a C# type, OpenSearch.Client can infer the correct mapping to send to OpenSearch. +- Operator overloading in queries. +- Type and index inference. + +You can use both .NET clients in a console program, a .NET core, an ASP.NET core, or in worker services. 
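+
+To give a sense of what a strongly typed request looks like, the following is a brief sketch only; the `Student` class, index name, and node address are assumptions used for illustration. See the guides linked below for complete, tested examples:
+
+```cs
+using OpenSearch.Client;
+
+// Sketch only: assumes a local cluster and a "students" index.
+var settings = new ConnectionSettings(new Uri("http://localhost:9200")).DefaultIndex("students");
+var client = new OpenSearchClient(settings);
+
+// A strongly typed search; no raw JSON is constructed or parsed by hand.
+var response = client.Search<Student>(s => s
+    .Query(q => q
+        .Match(m => m
+            .Field(f => f.LastName)
+            .Query("Santos"))));
+
+foreach (var student in response.Documents)
+{
+    Console.WriteLine(student.LastName);
+}
+
+// Minimal illustrative type; your own model will differ.
+public class Student
+{
+    public int Id { get; set; }
+    public string FirstName { get; set; }
+    public string LastName { get; set; }
+}
+```
+{% include copy.html %}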
+ +To get started with OpenSearch.Client, follow the instructions in [Getting started with the high-level .NET client]({{site.url}}{{site.baseurl}}/clients/OSC-dot-net#installing-opensearchclient) or in [More advanced features of the high-level .NET client]({{site.url}}{{site.baseurl}}/clients/OSC-example), a slightly more advanced walkthrough. \ No newline at end of file diff --git a/_clients/go.md b/_clients/go.md index 75ee300e..4e7de566 100644 --- a/_clients/go.md +++ b/_clients/go.md @@ -1,31 +1,330 @@ --- layout: default title: Go client -nav_order: 80 +nav_order: 50 --- # Go client -The OpenSearch Go client lets you connect your Go application with the data in your OpenSearch cluster. +The OpenSearch Go client lets you connect your Go application with the data in your OpenSearch cluster. This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client's complete API documentation and additional examples, see the [Go client API documentation](https://pkg.go.dev/github.com/opensearch-project/opensearch-go/v2). + +For the client source code, see the [opensearch-go repo](https://github.com/opensearch-project/opensearch-go). ## Setup -If you're creating a new project: +If you're starting a new project, create a new module by running the following command: ```go -go mod init +go mod init ``` +{% include copy.html %} -To add the client to your project, import it like any other module: +To add the Go client to your project, import it like any other module: ```go go get github.com/opensearch-project/opensearch-go ``` +{% include copy.html %} -## Sample code +## Connecting to OpenSearch -This sample code creates a client, adds an index with non-default settings, inserts a document, searches for the document, deletes the document, and finally deletes the index: +To connect to the default OpenSearch host, create a client object with the address `https://localhost:9200` if you are using the Security plugin: + +```go +client, err := opensearch.NewClient(opensearch.Config{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + }, + Addresses: []string{"https://localhost:9200"}, + Username: "admin", // For testing only. Don't store credentials in code. + Password: "admin", + }) +``` +{% include copy.html %} + +If you are not using the Security plugin, create a client object with the address `http://localhost:9200`: + +```go +client, err := opensearch.NewClient(opensearch.Config{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + }, + Addresses: []string{"http://localhost:9200"}, + }) +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Service + +The following example illustrates connecting to Amazon OpenSearch Service: + +```go +package main + +import ( + "context" + "log" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/config" + opensearch "github.com/opensearch-project/opensearch-go/v2" + opensearchapi "github.com/opensearch-project/opensearch-go/v2/opensearchapi" + requestsigner "github.com/opensearch-project/opensearch-go/v2/signer/awsv2" +) + +const endpoint = "" // e.g. https://opensearch-domain.region.com or Amazon OpenSearch Serverless endpoint + +func main() { + ctx := context.Background() + + awsCfg, err := config.LoadDefaultConfig(ctx, + config.WithRegion(""), + config.WithCredentialsProvider( + getCredentialProvider("", "", ""), + ), + ) + if err != nil { + log.Fatal(err) // Do not log.fatal in a production ready app. 
+ } + + // Create an AWS request Signer and load AWS configuration using default config folder or env vars. + signer, err := requestsigner.NewSignerWithService(awsCfg, "es") + if err != nil { + log.Fatal(err) // Do not log.fatal in a production ready app. + } + + // Create an opensearch client and use the request-signer + client, err := opensearch.NewClient(opensearch.Config{ + Addresses: []string{endpoint}, + Signer: signer, + }) + if err != nil { + log.Fatal("client creation err", err) + } +} + +func getCredentialProvider(accessKey, secretAccessKey, token string) aws.CredentialsProviderFunc { + return func(ctx context.Context) (aws.Credentials, error) { + c := &aws.Credentials{ + AccessKeyID: accessKey, + SecretAccessKey: secretAccessKey, + SessionToken: token, + } + return *c, nil + } +} +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Serverless + +The following example illustrates connecting to Amazon OpenSearch Serverless Service: + +```go +package main + +import ( + "context" + "log" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/config" + opensearch "github.com/opensearch-project/opensearch-go/v2" + opensearchapi "github.com/opensearch-project/opensearch-go/v2/opensearchapi" + requestsigner "github.com/opensearch-project/opensearch-go/v2/signer/awsv2" +) + +const endpoint = "" // e.g. https://opensearch-domain.region.com or Amazon OpenSearch Serverless endpoint + +func main() { + ctx := context.Background() + + awsCfg, err := config.LoadDefaultConfig(ctx, + config.WithRegion(""), + config.WithCredentialsProvider( + getCredentialProvider("", "", ""), + ), + ) + if err != nil { + log.Fatal(err) // Do not log.fatal in a production ready app. + } + + // Create an AWS request Signer and load AWS configuration using default config folder or env vars. + signer, err := requestsigner.NewSignerWithService(awsCfg, "aoss") + if err != nil { + log.Fatal(err) // Do not log.fatal in a production ready app. + } + + // Create an opensearch client and use the request-signer + client, err := opensearch.NewClient(opensearch.Config{ + Addresses: []string{endpoint}, + Signer: signer, + }) + if err != nil { + log.Fatal("client creation err", err) + } +} + +func getCredentialProvider(accessKey, secretAccessKey, token string) aws.CredentialsProviderFunc { + return func(ctx context.Context) (aws.Credentials, error) { + c := &aws.Credentials{ + AccessKeyID: accessKey, + SecretAccessKey: secretAccessKey, + SessionToken: token, + } + return *c, nil + } +} +``` +{% include copy.html %} + +The Go client constructor takes an `opensearch.Config{}` type, which can be customized using options such as a list of OpenSearch node addresses or a username and password combination. + +To connect to multiple OpenSearch nodes, specify them in the `Addresses` parameter: + +```go +var ( + urls = []string{"http://localhost:9200", "http://localhost:9201", "http://localhost:9202"} +) + +client, err := opensearch.NewClient(opensearch.Config{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + }, + Addresses: urls, +}) +``` +{% include copy.html %} + +The Go client retries requests for a maximum of three times by default. To customize the number of retries, set the `MaxRetries` parameter. Additionally, you can change the list of response codes for which a request is retried by setting the `RetryOnStatus` parameter. 
The following code snippet creates a new Go client with custom `MaxRetries` and `RetryOnStatus` values: + +```go +client, err := opensearch.NewClient(opensearch.Config{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + }, + Addresses: []string{"http://localhost:9200"}, + MaxRetries: 5, + RetryOnStatus: []int{502, 503, 504}, + }) +``` +{% include copy.html %} + +## Creating an index + +To create an OpenSearch index, use the `IndicesCreateRequest` method. You can use the following code to construct a JSON object with custom settings : + +```go +settings := strings.NewReader(`{ + 'settings': { + 'index': { + 'number_of_shards': 1, + 'number_of_replicas': 0 + } + } + }`) + +res := opensearchapi.IndicesCreateRequest{ + Index: "go-test-index1", + Body: settings, +} +``` +{% include copy.html %} + +## Indexing a document + +You can index a document into OpenSearch using the `IndexRequest` method: + +```go +document := strings.NewReader(`{ + "title": "Moneyball", + "director": "Bennett Miller", + "year": "2011" +}`) + +docId := "1" +req := opensearchapi.IndexRequest{ + Index: "go-test-index1", + DocumentID: docId, + Body: document, +} +insertResponse, err := req.Do(context.Background(), client) +``` +{% include copy.html %} + +## Performing bulk operations + +You can perform several operations at the same time by using the `Bulk` method of the client. The operations may be of the same type or of different types. + +```go +blk, err := client.Bulk( + strings.NewReader(` + { "index" : { "_index" : "go-test-index1", "_id" : "2" } } + { "title" : "Interstellar", "director" : "Christopher Nolan", "year" : "2014"} + { "create" : { "_index" : "go-test-index1", "_id" : "3" } } + { "title" : "Star Trek Beyond", "director" : "Justin Lin", "year" : "2015"} + { "update" : {"_id" : "3", "_index" : "go-test-index1" } } + { "doc" : {"year" : "2016"} } +`), + ) +``` +{% include copy.html %} + +## Searching for documents + +The easiest way to search for documents is to construct a query string. The following code uses a `multi_match` query to search for "miller" in the title and director fields. It boosts the documents where "miller" appears in the title field: + +```go +content := strings.NewReader(`{ + "size": 5, + "query": { + "multi_match": { + "query": "miller", + "fields": ["title^2", "director"] + } + } +}`) + +search := opensearchapi.SearchRequest{ + Index: []string{"go-test-index1"}, + Body: content, +} + +searchResponse, err := search.Do(context.Background(), client) +``` +{% include copy.html %} + +## Deleting a document + +You can delete a document using the `DeleteRequest` method: + +```go +delete := opensearchapi.DeleteRequest{ + Index: "go-test-index1", + DocumentID: "1", +} + +deleteResponse, err := delete.Do(context.Background(), client) +``` +{% include copy.html %} + +## Deleting an index + +You can delete an index using the `IndicesDeleteRequest` method: + +```go +deleteIndex := opensearchapi.IndicesDeleteRequest{ + Index: []string{"go-test-index1"}, +} + +deleteIndexResponse, err := deleteIndex.Do(context.Background(), client) +``` +{% include copy.html %} + +## Sample program + +The following sample program creates a client, adds an index with non-default settings, inserts a document, performs bulk operations, searches for the document, deletes the document, and then deletes the index: ```go package main @@ -58,21 +357,23 @@ func main() { // Print OpenSearch version information on console. fmt.Println(client.Info()) - // Define index mapping. 
- mapping := strings.NewReader(`{ + // Define index settings. + settings := strings.NewReader(`{ 'settings': { 'index': { - 'number_of_shards': 4 + 'number_of_shards': 1, + 'number_of_replicas': 2 } } }`) // Create an index with non-default settings. - res := opensearchapi.CreateRequest{ - Index: IndexName, - Body: mapping, + res := opensearchapi.IndicesCreateRequest{ + Index: IndexName, + Body: settings, } - fmt.Println("creating index", res) + fmt.Println("Creating index") + fmt.Println(res) // Add a document to the index. document := strings.NewReader(`{ @@ -92,7 +393,28 @@ func main() { fmt.Println("failed to insert document ", err) os.Exit(1) } + fmt.Println("Inserting a document") fmt.Println(insertResponse) + defer insertResponse.Body.Close() + + // Perform bulk operations. + blk, err := client.Bulk( + strings.NewReader(` + { "index" : { "_index" : "go-test-index1", "_id" : "2" } } + { "title" : "Interstellar", "director" : "Christopher Nolan", "year" : "2014"} + { "create" : { "_index" : "go-test-index1", "_id" : "3" } } + { "title" : "Star Trek Beyond", "director" : "Justin Lin", "year" : "2015"} + { "update" : {"_id" : "3", "_index" : "go-test-index1" } } + { "doc" : {"year" : "2016"} } +`), + ) + + if err != nil { + fmt.Println("failed to perform bulk operations", err) + os.Exit(1) + } + fmt.Println("Performing bulk operations") + fmt.Println(blk) // Search for the document. content := strings.NewReader(`{ @@ -106,6 +428,7 @@ func main() { }`) search := opensearchapi.SearchRequest{ + Index: []string{IndexName}, Body: content, } @@ -114,7 +437,9 @@ func main() { fmt.Println("failed to search document ", err) os.Exit(1) } + fmt.Println("Searching for a document") fmt.Println(searchResponse) + defer searchResponse.Body.Close() // Delete the document. delete := opensearchapi.DeleteRequest{ @@ -127,10 +452,11 @@ func main() { fmt.Println("failed to delete document ", err) os.Exit(1) } - fmt.Println("deleting document") + fmt.Println("Deleting a document") fmt.Println(deleteResponse) + defer deleteResponse.Body.Close() - // Delete previously created index. + // Delete the previously created index. deleteIndex := opensearchapi.IndicesDeleteRequest{ Index: []string{IndexName}, } @@ -140,6 +466,9 @@ func main() { fmt.Println("failed to delete index ", err) os.Exit(1) } - fmt.Println("deleting index", deleteIndexResponse) + fmt.Println("Deleting the index") + fmt.Println(deleteIndexResponse) + defer deleteIndexResponse.Body.Close() } ``` +{% include copy.html %} \ No newline at end of file diff --git a/_clients/index.md b/_clients/index.md index 2f3513dd..7ce0acc2 100644 --- a/_clients/index.md +++ b/_clients/index.md @@ -1,36 +1,64 @@ --- layout: default -title: Compatibility +title: Language clients nav_order: 1 has_children: false +nav_exclude: true +permalink: /clients/ redirect_from: - - /clients/ + - /clients/index/ --- -# OpenSearch client compatibility +# OpenSearch language clients -OpenSearch provides clients for several popular programming languages, with more coming. In general, clients are compatible with clusters running the same major version of OpenSearch (`major.minor.patch`). +OpenSearch provides clients in JavaScript, Python, Ruby, Java, PHP, .NET, Go and Rust. -For example, a 1.0.0 client works with an OpenSearch 1.1.0 cluster, but might not support any non-breaking API changes in OpenSearch 1.1.0. A 1.2.0 client works with the same cluster, but might allow you to pass unsupported options in certain functions. 
We recommend using the same version for both, but if your tests pass after a cluster upgrade, you don't necessarily need to upgrade your clients immediately. +## OpenSearch clients -{% comment %} -* [OpenSearch Java client]({{site.url}}{{site.baseurl}}/clients/java/) -{% endcomment %} -* [OpenSearch Python client]({{site.url}}{{site.baseurl}}/clients/python/) -* [OpenSearch JavaScript (Node.js) client]({{site.url}}{{site.baseurl}}/clients/javascript/) -* [OpenSearch Go client]({{site.url}}{{site.baseurl}}/clients/go/) +OpenSearch provides clients for the following programming languages and platforms: +* **Python** + * [OpenSearch high-level Python client]({{site.url}}{{site.baseurl}}/clients/python-high-level/) + * [OpenSearch low-level Python client]({{site.url}}{{site.baseurl}}/clients/python-low-level/) + * [`opensearch-py-ml` client]({{site.url}}{{site.baseurl}}/clients/opensearch-py-ml/) +* **Java** + * [OpenSearch Java client]({{site.url}}{{site.baseurl}}/clients/java/) +* **JavaScript** + * [OpenSearch JavaScript (Node.js) client]({{site.url}}{{site.baseurl}}/clients/javascript/index) +* **Go** + * [OpenSearch Go client]({{site.url}}{{site.baseurl}}/clients/go/) +* **Ruby** + * [OpenSearch Ruby client]({{site.url}}{{site.baseurl}}/clients/ruby/) +* **PHP** + * [OpenSearch PHP client]({{site.url}}{{site.baseurl}}/clients/php/) +* **.NET** + * [OpenSearch .NET clients]({{site.url}}{{site.baseurl}}/clients/dot-net/) +* **Rust** + * [OpenSearch Rust client]({{site.url}}{{site.baseurl}}/clients/rust/) +* **Hadoop** + * [OpenSearch Hadoop client](https://github.com/opensearch-project/opensearch-hadoop) + + +For a client compatibility matrix, see the COMPATIBILITY.md file in the client's repository. +{: .note} + +The OpenSearch Java high-level REST client will be deprecated starting with OpenSearch version 3.0.0 and will be removed in a future release. We recommend switching to the [Java client]({{site.url}}{{site.baseurl}}/clients/java/) instead. +{: .warning} ## Legacy clients Most clients that work with Elasticsearch OSS 7.10.2 *should* work with OpenSearch, but the latest versions of those clients might include license or version checks that artificially break compatibility. This page includes recommendations around which versions of those clients to use for best compatibility with OpenSearch. +For a client compatibility matrix, see the COMPATIBILITY.md file in the client's repository. 
+{: .note} + Client | Recommended version :--- | :--- -[Java low-level REST client](https://search.maven.org/artifact/org.elasticsearch.client/elasticsearch-rest-client/7.13.4/jar) | 7.13.4 -[Java high-level REST client](https://search.maven.org/artifact/org.elasticsearch.client/elasticsearch-rest-high-level-client/7.13.4/jar) | 7.13.4 -[Python Elasticsearch client](https://pypi.org/project/elasticsearch/7.13.4/) | 7.13.4 +[Elasticsearch Java low-level REST client](https://search.maven.org/artifact/org.elasticsearch.client/elasticsearch-rest-client/7.13.4/jar) | 7.13.4 +[Elasticsearch Java high-level REST client](https://search.maven.org/artifact/org.elasticsearch.client/elasticsearch-rest-high-level-client/7.13.4/jar) | 7.13.4 +[Elasticsearch Python client](https://pypi.org/project/elasticsearch/7.13.4/) | 7.13.4 [Elasticsearch Node.js client](https://www.npmjs.com/package/@elastic/elasticsearch/v/7.13.0) | 7.13.0 +[Elasticsearch Ruby client](https://rubygems.org/gems/elasticsearch/versions/7.13.0) | 7.13.0 If you test a legacy client and verify that it works, please [submit a PR](https://github.com/opensearch-project/documentation-website/pulls) and add it to this table. diff --git a/_clients/java-rest-high-level.md b/_clients/java-rest-high-level.md index d441a36d..e4364994 100644 --- a/_clients/java-rest-high-level.md +++ b/_clients/java-rest-high-level.md @@ -1,15 +1,15 @@ --- layout: default title: Java high-level REST client -nav_order: 60 +nav_order: 20 --- # Java high-level REST client -Although the OpenSearch Java high-level REST client is still usable, we recommend that you use the [OpenSearch Java client]({{site.url}}{{site.baseurl}}/clients/java/), which replaces the existing Java high-level REST client. -{: .note} +The OpenSearch Java high-level REST client is deprecated. Support will be removed in OpenSearch version 3.0.0. We recommend switching to the [Java client]({{site.url}}{{site.baseurl}}/clients/java/) instead. +{: .warning} -The OpenSearch Java high-level REST client lets you interact with your OpenSearch clusters and indices through Java methods and data structures rather than HTTP methods and JSON. +The OpenSearch Java high-level REST client lets you interact with your OpenSearch clusters and indexes through Java methods and data structures rather than HTTP methods and JSON. ## Setup @@ -25,7 +25,23 @@ To start using the OpenSearch Java high-level REST client, ensure that you have You can now start your OpenSearch cluster. The OpenSearch 1.x high-level REST client works with the 1.x versions of OpenSearch. -## Sample code +## Security + +Before using the REST client in your Java application, you must configure the application's truststore to connect to the Security plugin. If you are using self-signed certificates or demo configurations, you can use the following command to create a custom truststore and add in root authority certificates. + +If you're using certificates from a trusted Certificate Authority (CA), you don't need to configure the truststore. + +```bash +keytool -import -alias -keystore +``` + +You can now point your Java client to the truststore and set basic authentication credentials that can access a secure cluster (refer to the sample code below on how to do so). + +If you run into issues when configuring security, see [common issues]({{site.url}}{{site.baseurl}}/troubleshoot/index) and [troubleshoot TLS]({{site.url}}{{site.baseurl}}/troubleshoot/tls). 
+ +## Sample program + +This code example uses basic credentials that come with the default OpenSearch configuration. If you’re using the OpenSearch Java high-level REST client with your own OpenSearch cluster, be sure to change the code to use your own credentials. ```java import org.apache.http.HttpHost; @@ -62,7 +78,7 @@ public class RESTClientSample { System.setProperty("javax.net.ssl.trustStorePassword", "password-to-keystore"); //Establish credentials to use basic authentication. - //Only for demo purposes. Do not specify your credentials in code. + //Only for demo purposes. Don't specify your credentials in code. final CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); credentialsProvider.setCredentials(AuthScope.ANY, diff --git a/_clients/java.md b/_clients/java.md index c51a664f..4c1e06a4 100644 --- a/_clients/java.md +++ b/_clients/java.md @@ -1,41 +1,93 @@ --- layout: default title: Java client -nav_order: 65 +nav_order: 30 --- # Java client -The OpenSearch Java client allows you to interact with your OpenSearch clusters through Java methods and data structures rather than HTTP methods and raw JSON. +The OpenSearch Java client allows you to interact with your OpenSearch clusters through Java methods and data structures rather than HTTP methods and raw JSON. For example, you can submit requests to your cluster using objects to create indexes, add data to documents, or complete some other operation using the client's built-in methods. For the client's complete API documentation and additional examples, see the [javadoc](https://www.javadoc.io/doc/org.opensearch.client/opensearch-java/latest/index.html). -For example, you can submit requests to your cluster using objects to create indices, add data to documents, or complete some other operation using the client's built-in methods. +This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [opensearch-java repo](https://github.com/opensearch-project/opensearch-java). -## Setup +## Installing the client using Apache HttpClient 5 Transport -To start using the OpenSearch Java client, ensure that you have the following dependency in your project's `pom.xml` file: +To start using the OpenSearch Java client, you need to provide a transport. The default `ApacheHttpClient5TransportBuilder` transport comes with the Java client. To use the OpenSearch Java client with the default transport, add it to your `pom.xml` file as a dependency: -``` +```xml org.opensearch.client opensearch-java - 0.1.0 + 2.8.1 + + + + org.apache.httpcomponents.client5 + httpclient5 + 5.2.1 ``` +{% include copy.html %} -If you're using Gradle, add the following dependencies to your project. +If you're using Gradle, add the following dependencies to your project: ``` dependencies { - implementation 'org.opensearch.client:opensearch-rest-client: {{site.opensearch_version}}' - implementation 'org.opensearch.client:opensearch-java:0.1.0' + implementation 'org.opensearch.client:opensearch-java:2.8.1' + implementation 'org.apache.httpcomponents.client5:httpclient5:5.2.1' } ``` +{% include copy.html %} You can now start your OpenSearch cluster. -The following example uses credentials that come with the default OpenSearch configuration. If you're using the OpenSearch Java client with your own OpenSearch cluster, be sure to change the code to use your own credentials. 
+## Installing the client using RestClient Transport -## Sample code +Alternatively, you can create a Java client by using the `RestClient`-based transport. In this case, make sure that you have the following dependencies in your project's `pom.xml` file: + +```xml + + org.opensearch.client + opensearch-rest-client + {{site.opensearch_version}} + + + + org.opensearch.client + opensearch-java + 2.6.0 + +``` +{% include copy.html %} + +If you're using Gradle, add the following dependencies to your project" + +``` +dependencies { + implementation 'org.opensearch.client:opensearch-rest-client:{{site.opensearch_version}}' + implementation 'org.opensearch.client:opensearch-java:2.6.0' +} +``` +{% include copy.html %} + +You can now start your OpenSearch cluster. + +## Security + +Before using the REST client in your Java application, you must configure the application's truststore to connect to the Security plugin. If you are using self-signed certificates or demo configurations, you can use the following command to create a custom truststore and add in root authority certificates. + +If you're using certificates from a trusted Certificate Authority (CA), you don't need to configure the truststore. + +```bash +keytool -import -alias -keystore +``` +{% include copy.html %} + +You can now point your Java client to the truststore and set basic authentication credentials that can access a secure cluster (refer to the sample code below on how to do so). + +If you run into issues when configuring security, see [common issues]({{site.url}}{{site.baseurl}}/troubleshoot/index) and [troubleshoot TLS]({{site.url}}{{site.baseurl}}/troubleshoot/tls). + +## Sample data This section uses a class called `IndexData`, which is a simple Java class that stores basic data and methods. For your own OpenSearch cluster, you might find that you need a more robust class to store your data. @@ -73,8 +125,233 @@ static class IndexData { } } ``` +{% include copy.html %} -### OpenSearch client example +## Initializing the client with SSL and TLS enabled using Apache HttpClient 5 Transport + +This code example uses basic credentials that come with the default OpenSearch configuration. If you’re using the Java client with your own OpenSearch cluster, be sure to change the code so that it uses your own credentials. 
+ +The following sample code initializes a client with SSL and TLS enabled: + + +```java +import javax.net.ssl.SSLContext; +import javax.net.ssl.SSLEngine; + +import org.apache.hc.client5.http.auth.AuthScope; +import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; +import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider; +import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManager; +import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManagerBuilder; +import org.apache.hc.client5.http.ssl.ClientTlsStrategyBuilder; +import org.apache.hc.core5.function.Factory; +import org.apache.hc.core5.http.HttpHost; +import org.apache.hc.core5.http.nio.ssl.TlsStrategy; +import org.apache.hc.core5.reactor.ssl.TlsDetails; +import org.apache.hc.core5.ssl.SSLContextBuilder; +import org.opensearch.client.opensearch.OpenSearchClient; +import org.opensearch.client.transport.OpenSearchTransport; +import org.opensearch.client.transport.httpclient5.ApacheHttpClient5TransportBuilder; + +public class OpenSearchClientExample { + public static void main(String[] args) throws Exception { + System.setProperty("javax.net.ssl.trustStore", "/full/path/to/keystore"); + System.setProperty("javax.net.ssl.trustStorePassword", "password-to-keystore"); + + final HttpHost host = new HttpHost("https", "localhost", 9200); + final BasicCredentialsProvider credentialsProvider = new BasicCredentialsProvider(); + // Only for demo purposes. Don't specify your credentials in code. + credentialsProvider.setCredentials(new AuthScope(host), new UsernamePasswordCredentials("admin", "admin".toCharArray())); + + final SSLContext sslcontext = SSLContextBuilder + .create() + .loadTrustMaterial(null, (chains, authType) -> true) + .build(); + + final ApacheHttpClient5TransportBuilder builder = ApacheHttpClient5TransportBuilder.builder(host); + builder.setHttpClientConfigCallback(httpClientBuilder -> { + final TlsStrategy tlsStrategy = ClientTlsStrategyBuilder.create() + .setSslContext(sslcontext) + // See https://issues.apache.org/jira/browse/HTTPCLIENT-2219 + .setTlsDetailsFactory(new Factory() { + @Override + public TlsDetails create(final SSLEngine sslEngine) { + return new TlsDetails(sslEngine.getSession(), sslEngine.getApplicationProtocol()); + } + }) + .build(); + + final PoolingAsyncClientConnectionManager connectionManager = PoolingAsyncClientConnectionManagerBuilder + .create() + .setTlsStrategy(tlsStrategy) + .build(); + + return httpClientBuilder + .setDefaultCredentialsProvider(credentialsProvider) + .setConnectionManager(connectionManager); + }); + + final OpenSearchTransport transport = builder.build(); + OpenSearchClient client = new OpenSearchClient(transport); + } +} + +``` + +## Initializing the client with SSL and TLS enabled using RestClient Transport + +This code example uses basic credentials that come with the default OpenSearch configuration. If you’re using the Java client with your own OpenSearch cluster, be sure to change the code so that it uses your own credentials. 
+ +The following sample code initializes a client with SSL and TLS enabled: + +```java +import org.apache.http.HttpHost; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.impl.nio.client.HttpAsyncClientBuilder; +import org.apache.http.impl.client.BasicCredentialsProvider; +import org.opensearch.client.RestClient; +import org.opensearch.client.RestClientBuilder; +import org.opensearch.client.json.jackson.JacksonJsonpMapper; +import org.opensearch.client.opensearch.OpenSearchClient; +import org.opensearch.client.transport.OpenSearchTransport; +import org.opensearch.client.transport.rest_client.RestClientTransport; + +public class OpenSearchClientExample { + public static void main(String[] args) throws Exception { + System.setProperty("javax.net.ssl.trustStore", "/full/path/to/keystore"); + System.setProperty("javax.net.ssl.trustStorePassword", "password-to-keystore"); + + final HttpHost host = new HttpHost("https", "localhost", 9200); + final BasicCredentialsProvider credentialsProvider = new BasicCredentialsProvider(); + //Only for demo purposes. Don't specify your credentials in code. + credentialsProvider.setCredentials(new AuthScope(host), new UsernamePasswordCredentials("admin", "admin".toCharArray())); + + //Initialize the client with SSL and TLS enabled + final RestClient restClient = RestClient.builder(host). + setHttpClientConfigCallback(new RestClientBuilder.HttpClientConfigCallback() { + @Override + public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) { + return httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider); + } + }).build(); + + final OpenSearchTransport transport = new RestClientTransport(restClient, new JacksonJsonpMapper()); + final OpenSearchClient client = new OpenSearchClient(transport); + } +} +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Service + +The following example illustrates connecting to Amazon OpenSearch Service: + +```java +SdkHttpClient httpClient = ApacheHttpClient.builder().build(); + +OpenSearchClient client = new OpenSearchClient( + new AwsSdk2Transport( + httpClient, + "search-...us-west-2.es.amazonaws.com", // OpenSearch endpoint, without https:// + "es", + Region.US_WEST_2, // signing service region + AwsSdk2TransportOptions.builder().build() + ) +); + +InfoResponse info = client.info(); +System.out.println(info.version().distribution() + ": " + info.version().number()); + +httpClient.close(); +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Serverless + +The following example illustrates connecting to Amazon OpenSearch Serverless Service: + +```java +SdkHttpClient httpClient = ApacheHttpClient.builder().build(); + +OpenSearchClient client = new OpenSearchClient( + new AwsSdk2Transport( + httpClient, + "search-...us-west-2.aoss.amazonaws.com", // OpenSearch endpoint, without https:// + "aoss" + Region.US_WEST_2, // signing service region + AwsSdk2TransportOptions.builder().build() + ) +); + +InfoResponse info = client.info(); +System.out.println(info.version().distribution() + ": " + info.version().number()); + +httpClient.close(); +``` +{% include copy.html %} + + +## Creating an index + +You can create an index with non-default settings using the following code: + +```java +String index = "sample-index"; +CreateIndexRequest createIndexRequest = new CreateIndexRequest.Builder().index(index).build(); +client.indices().create(createIndexRequest); + +IndexSettings indexSettings = new 
IndexSettings.Builder().autoExpandReplicas("0-all").build(); +PutIndicesSettingsRequest putIndicesSettingsRequest = new PutIndicesSettingsRequest.Builder().index(index).value(indexSettings).build(); +client.indices().putSettings(putIndicesSettingsRequest); +``` +{% include copy.html %} + +## Indexing data + +You can index data into OpenSearch using the following code: + +```java +IndexData indexData = new IndexData("first_name", "Bruce"); +IndexRequest indexRequest = new IndexRequest.Builder().index(index).id("1").document(indexData).build(); +client.index(indexRequest); +``` +{% include copy.html %} + +## Searching for documents + +You can search for a document using the following code: + +```java +SearchResponse searchResponse = client.search(s -> s.index(index), IndexData.class); +for (int i = 0; i< searchResponse.hits().hits().size(); i++) { + System.out.println(searchResponse.hits().hits().get(i).source()); +} +``` +{% include copy.html %} + +## Deleting a document + +The following sample code deletes a document whose ID is 1: + +```java +client.delete(b -> b.index(index).id("1")); +``` +{% include copy.html %} + +### Deleting an index + +The following sample code deletes an index: + +```java +DeleteIndexRequest deleteIndexRequest = new DeleteIndexRequest.Builder().index(index).build(); +DeleteIndexResponse deleteIndexResponse = client.indices().delete(deleteIndexRequest); +``` +{% include copy.html %} + +## Sample program + +The following sample program creates a client, adds an index with non-default settings, inserts a document, searches for the document, deletes the document, and then deletes the index: ```java import org.apache.http.HttpHost; @@ -85,76 +362,75 @@ import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.nio.client.HttpAsyncClientBuilder; import org.opensearch.client.RestClient; import org.opensearch.client.RestClientBuilder; -import org.opensearch.clients.base.RestClientTransport; -import org.opensearch.clients.base.Transport; -import org.opensearch.clients.json.jackson.JacksonJsonpMapper; -import org.opensearch.clients.opensearch.OpenSearchClient; -import org.opensearch.clients.opensearch._global.IndexRequest; -import org.opensearch.clients.opensearch._global.IndexResponse; -import org.opensearch.clients.opensearch._global.SearchResponse; -import org.opensearch.clients.opensearch.indices.*; -import org.opensearch.clients.opensearch.indices.put_settings.IndexSettingsBody; +import org.opensearch.client.base.RestClientTransport; +import org.opensearch.client.base.Transport; +import org.opensearch.client.json.jackson.JacksonJsonpMapper; +import org.opensearch.client.opensearch.OpenSearchClient; +import org.opensearch.client.opensearch._global.IndexRequest; +import org.opensearch.client.opensearch._global.IndexResponse; +import org.opensearch.client.opensearch._global.SearchResponse; +import org.opensearch.client.opensearch.indices.*; +import org.opensearch.client.opensearch.indices.put_settings.IndexSettingsBody; import java.io.IOException; public class OpenSearchClientExample { public static void main(String[] args) { + RestClient restClient = null; try{ - System.setProperty("javax.net.ssl.trustStore", "/full/path/to/keystore"); - System.setProperty("javax.net.ssl.trustStorePassword", "password-to-keystore"); + System.setProperty("javax.net.ssl.trustStore", "/full/path/to/keystore"); + System.setProperty("javax.net.ssl.trustStorePassword", "password-to-keystore"); - //Only for demo purposes. Don't specify your credentials in code. 
- final CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); - credentialsProvider.setCredentials(AuthScope.ANY, - new UsernamePasswordCredentials("admin", "admin")); + //Only for demo purposes. Don't specify your credentials in code. + final CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); + credentialsProvider.setCredentials(AuthScope.ANY, + new UsernamePasswordCredentials("admin", "admin")); - //Initialize the client with SSL and TLS enabled - RestClient restClient = RestClient.builder(new HttpHost("localhost", 9200, "https")). - setHttpClientConfigCallback(new RestClientBuilder.HttpClientConfigCallback() { - @Override - public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) { - return httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider); - } - }).build(); - Transport transport = new RestClientTransport(restClient, new JacksonJsonpMapper()); - OpenSearchClient client = new OpenSearchClient(transport); + //Initialize the client with SSL and TLS enabled + restClient = RestClient.builder(new HttpHost("localhost", 9200, "https")). + setHttpClientConfigCallback(new RestClientBuilder.HttpClientConfigCallback() { + @Override + public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) { + return httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider); + } + }).build(); + Transport transport = new RestClientTransport(restClient, new JacksonJsonpMapper()); + OpenSearchClient client = new OpenSearchClient(transport); - //Create the index - String index = "sample-index"; - CreateRequest createIndexRequest = new CreateRequest.Builder().index(index).build(); - client.indices().create(createIndexRequest); + //Create the index + String index = "sample-index"; + CreateIndexRequest createIndexRequest = new CreateIndexRequest.Builder().index(index).build(); + client.indices().create(createIndexRequest); - //Add some settings to the index - IndexSettings indexSettings = new IndexSettings.Builder().autoExpandReplicas("0-all").build(); - IndexSettingsBody settingsBody = new IndexSettingsBody.Builder().settings(indexSettings).build(); - PutSettingsRequest putSettingsRequest = new PutSettingsRequest.Builder().index(index).value(settingsBody).build(); - client.indices().putSettings(putSettingsRequest); + //Add some settings to the index + IndexSettings indexSettings = new IndexSettings.Builder().autoExpandReplicas("0-all").build(); + IndexSettingsBody settingsBody = new IndexSettingsBody.Builder().settings(indexSettings).build(); + PutSettingsRequest putSettingsRequest = new PutSettingsRequest.Builder().index(index).value(settingsBody).build(); + client.indices().putSettings(putSettingsRequest); - //Index some data - IndexData indexData = new IndexData("first_name", "Bruce"); - IndexRequest indexRequest = new IndexRequest.Builder().index(index).id("1").value(indexData).build(); - client.index(indexRequest); + //Index some data + IndexData indexData = new IndexData("first_name", "Bruce"); + IndexRequest indexRequest = new IndexRequest.Builder().index(index).id("1").document(indexData).build(); + client.index(indexRequest); - //Search for the document - SearchResponse searchResponse = client.search(s -> s.index(index), IndexData.class); - for (int i = 0; i< searchResponse.hits().hits().size(); i++) { - System.out.println(searchResponse.hits().hits().get(i).source()); - } + //Search for the document + SearchResponse searchResponse = client.search(s -> s.index(index), 
IndexData.class); + for (int i = 0; i< searchResponse.hits().hits().size(); i++) { + System.out.println(searchResponse.hits().hits().get(i).source()); + } - //Delete the document - client.delete(b -> b.index(index).id("1")); + //Delete the document + client.delete(b -> b.index(index).id("1")); - // Delete the index - DeleteRequest deleteRequest = new DeleteRequest.Builder().index(index).build(); - DeleteResponse deleteResponse = client.indices().delete(deleteRequest); - - restClient.close(); + // Delete the index + DeleteIndexRequest deleteIndexRequest = new DeleteRequest.Builder().index(index).build(); + DeleteIndexResponse deleteIndexResponse = client.indices().delete(deleteIndexRequest); } catch (IOException e){ System.out.println(e.toString()); } finally { try { - if (client != null) { - client.close(); + if (restClient != null) { + restClient.close(); } } catch (IOException e) { System.out.println(e.toString()); @@ -163,3 +439,4 @@ public class OpenSearchClientExample { } } ``` +{% include copy.html %} diff --git a/_clients/javascript.md b/_clients/javascript.md deleted file mode 100644 index c670e4b8..00000000 --- a/_clients/javascript.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -layout: default -title: JavaScript client -nav_order: 90 ---- - -# JavaScript client - -The OpenSearch JavaScript client provides a safer and easier way to interact with your OpenSearch cluster. Rather than using OpenSearch from the browser and potentially exposing your data to the public, you can build an OpenSearch client that takes care of sending requests to your cluster. - -The client contains a library of APIs that let you perform different operations on your cluster and return a standard response body. The example here demonstrates some basic operations like creating an index, adding documents, and searching your data. - -## Setup - -To add the client to your project, install it from [npm](https://www.npmjs.com): - -```bash -npm install @opensearch-project/opensearch -``` - -To install a specific major version of the client, run the following command: - -```bash -npm install @opensearch-project/opensearch@ -``` - -If you prefer to add the client manually or just want to examine the source code, see [opensearch-js](https://github.com/opensearch-project/opensearch-js) on GitHub. - -Then require the client: - -```javascript -const { Client } = require("@opensearch-project/opensearch"); -``` - -## Sample code - -```javascript -"use strict"; - -var host = "localhost"; -var protocol = "https"; -var port = 9200; -var auth = "admin:admin"; // For testing only. Don't store credentials in code. -var ca_certs_path = "/full/path/to/root-ca.pem"; - -// Optional client certificates if you don't want to use HTTP basic authentication. -// var client_cert_path = '/full/path/to/client.pem' -// var client_key_path = '/full/path/to/client-key.pem' - -// Create a client with SSL/TLS enabled. -var { Client } = require("@opensearch-project/opensearch"); -var fs = require("fs"); -var client = new Client({ - node: protocol + "://" + auth + "@" + host + ":" + port, - ssl: { - ca: fs.readFileSync(ca_certs_path), - // You can turn off certificate verification (rejectUnauthorized: false) if you're using self-signed certificates with a hostname mismatch. - // cert: fs.readFileSync(client_cert_path), - // key: fs.readFileSync(client_key_path) - }, -}); - -async function search() { - // Create an index with non-default settings. 
- var index_name = "books"; - var settings = { - settings: { - index: { - number_of_shards: 4, - number_of_replicas: 3, - }, - }, - }; - - var response = await client.indices.create({ - index: index_name, - body: settings, - }); - - console.log("Creating index:"); - console.log(response.body); - - // Add a document to the index. - var document = { - title: "The Outsider", - author: "Stephen King", - year: "2018", - genre: "Crime fiction", - }; - - var id = "1"; - - var response = await client.index({ - id: id, - index: index_name, - body: document, - refresh: true, - }); - - console.log("Adding document:"); - console.log(response.body); - - // Search for the document. - var query = { - query: { - match: { - title: { - query: "The Outsider", - }, - }, - }, - }; - - var response = await client.search({ - index: index_name, - body: query, - }); - - console.log("Search results:"); - console.log(response.body.hits); - - // Delete the document. - var response = await client.delete({ - index: index_name, - id: id, - }); - - console.log("Deleting document:"); - console.log(response.body); - - // Delete the index. - var response = await client.indices.delete({ - index: index_name, - }); - - console.log("Deleting index:"); - console.log(response.body); -} - -search().catch(console.log); -``` diff --git a/_clients/javascript/helpers.md b/_clients/javascript/helpers.md new file mode 100644 index 00000000..f88efd8e --- /dev/null +++ b/_clients/javascript/helpers.md @@ -0,0 +1,205 @@ +--- +layout: default +title: Helper methods +parent: JavaScript client +nav_order: 2 +--- + +# Helper methods + +Helper methods simplify the use of complicated API tasks. For the client's complete API documentation and additional examples, see the [JS client API documentation](https://opensearch-project.github.io/opensearch-js/2.2/index.html). + +## Bulk helper + +The bulk helper simplifies making complex bulk API requests. The bulk helper supports operations of the same kind. Alternatively, you can use the `client.bulk` method to perform multiple types of bulk operations. For example, you can send `delete` and `index` operations in one bulk request. For more information, see the [Bulk guide](https://github.com/opensearch-project/opensearch-js/blob/main/guides/bulk.md). + +### Usage + +The following code creates a bulk helper instance: + +```javascript +const { Client } = require('@opensearch-project/opensearch') +const documents = require('./docs.json') + +const client = new Client({ ... }) + +const result = await client.helpers.bulk({ + datasource: documents, + onDocument (doc) { + return { + index: { _index: 'example-index' } + } + } +}) + +console.log(result) +``` +{% include copy.html %} + +Bulk helper operations return an object with the following fields: + +```json +{ + total: number, + failed: number, + retry: number, + successful: number, + time: number, + bytes: number, + aborted: boolean +} +``` + +#### Bulk helper configuration options + +When creating a new bulk helper instance, you can use the following configuration options. + +| Option | Data type | Required/Default | Description +| :--- | :--- | :--- | :--- +| `datasource` | An array, async generator or a readable stream of strings or objects | Required | Represents the documents you need to create, delete, index, or update. +| `onDocument` | Function | Required | A function to be invoked with each document in the given `datasource`. It returns the operation to be executed for this document. 
Optionally, the document can be manipulated for `create` and `index` operations by returning a new document as part of the function's result. +| `concurrency` | Integer | Optional. Default is 5. | The number of requests to be executed in parallel. +| `flushBytes` | Integer | Optional. Default is 5,000,000. | Maximum bulk body size to send in bytes. +| `flushInterval` | Integer | Optional. Default is 30,000. | Time in milliseconds to wait before flushing the body after the last document has been read. +| `onDrop` | Function | Optional. Default is `noop`. | A function to be invoked for every document that can’t be indexed after reaching the maximum number of retries. +| `refreshOnCompletion` | Boolean | Optional. Default is false. | Whether or not a refresh should be run on all affected indexes at the end of the bulk operation. +| `retries` | Integer | Optional. Defaults to the client's `maxRetries` value. | The number of times an operation is retried before `onDrop` is called for that document. +| `wait` | Integer | Optional. Default is 5,000. | Time in milliseconds to wait before retrying an operation. + +### Examples + +The following examples illustrate the index, create, update, and delete bulk helper operations. For more information and advanced index actions, see the [`opensearch-js` guides](https://github.com/opensearch-project/opensearch-js/tree/main/guides) in GitHub. + +#### Index + +The index operation creates a new document if it doesn’t exist and recreates the document if it already exists. + +The following bulk operation indexes documents into `example-index`: + +```javascript +client.helpers.bulk({ + datasource: arrayOfDocuments, + onDocument (doc) { + return { + index: { _index: 'example-index' } + } + } +}) +``` +{% include copy.html %} + +The following bulk operation indexes documents into `example-index` with document overwrite: + +```javascript +client.helpers.bulk({ + datasource: arrayOfDocuments, + onDocument (doc) { + return [ + { + index: { _index: 'example-index' } + }, + { ...doc, createdAt: new Date().toISOString() } + ] + } +}) +``` +{% include copy.html %} + +#### Create + +The create operation creates a new document only if the document does not already exist. + +The following bulk operation creates documents in the `example-index`: + +```javascript +client.helpers.bulk({ + datasource: arrayOfDocuments, + onDocument (doc) { + return { + create: { _index: 'example-index', _id: doc.id } + } + } +}) +``` +{% include copy.html %} + +The following bulk operation creates documents in the `example-index` with document overwrite: + +```javascript +client.helpers.bulk({ + datasource: arrayOfDocuments, + onDocument (doc) { + return [ + { + create: { _index: 'example-index', _id: doc.id } + }, + { ...doc, createdAt: new Date().toISOString() } + ] + } +}) +``` +{% include copy.html %} + +#### Update + +The update operation updates the document with the fields being sent. The document must already exist in the index. + +The following bulk operation updates documents in the `arrayOfDocuments`: + +```javascript +client.helpers.bulk({ + datasource: arrayOfDocuments, + onDocument (doc) { + // The update operation always requires a tuple to be returned, with the + // first element being the action and the second being the update options. 
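+    // doc_as_upsert: true performs an upsert: the document is created if it does not already exist.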
+ return [ + { + update: { _index: 'example-index', _id: doc.id } + }, + { doc_as_upsert: true } + ] + } +}) +``` +{% include copy.html %} + +The following bulk operation updates documents in the `arrayOfDocuments` with document overwrite: + +```javascript +client.helpers.bulk({ + datasource: arrayOfDocuments, + onDocument (doc) { + return [ + { + update: { _index: 'example-index', _id: doc.id } + }, + { + doc: { ...doc, createdAt: new Date().toISOString() }, + doc_as_upsert: true + } + ] + } +}) +``` +{% include copy.html %} + +#### Delete + +The delete operation deletes a document. + +The following bulk operation deletes documents from the `example-index`: + +```javascript +client.helpers.bulk({ + datasource: arrayOfDocuments, + onDocument (doc) { + return { + delete: { _index: 'example-index', _id: doc.id } + } + } +}) +``` +{% include copy.html %} + +## Related articles +https://github.com/opensearch-project/opensearch-js/tree/main/guides diff --git a/_clients/javascript/index.md b/_clients/javascript/index.md new file mode 100644 index 00000000..58e9f190 --- /dev/null +++ b/_clients/javascript/index.md @@ -0,0 +1,569 @@ +--- +layout: default +title: JavaScript client +has_children: true +nav_order: 40 +redirect_from: + - /clients/javascript/ +--- + +# JavaScript client + +The OpenSearch JavaScript (JS) client provides a safer and easier way to interact with your OpenSearch cluster. Rather than using OpenSearch from the browser and potentially exposing your data to the public, you can build an OpenSearch client that takes care of sending requests to your cluster. For the client's complete API documentation and additional examples, see the [JS client API documentation](https://opensearch-project.github.io/opensearch-js/2.2/index.html). + +The client contains a library of APIs that let you perform different operations on your cluster and return a standard response body. The example here demonstrates some basic operations like creating an index, adding documents, and searching your data. + +You can use helper methods to simplify the use of complicated API tasks. For more information, see [Helper methods]({{site.url}}{{site.baseurl}}/clients/javascript/helpers/). For more advanced index actions, see the [`opensearch-js` guides](https://github.com/opensearch-project/opensearch-js/tree/main/guides) in GitHub. + +## Setup + +To add the client to your project, install it from [`npm`](https://www.npmjs.com): + +```bash +npm install @opensearch-project/opensearch +``` +{% include copy.html %} + +To install a specific major version of the client, run the following command: + +```bash +npm install @opensearch-project/opensearch@ +``` +{% include copy.html %} + +If you prefer to add the client manually or only want to examine the source code, see [`opensearch-js`](https://github.com/opensearch-project/opensearch-js) on GitHub. + +Then require the client: + +```javascript +const { Client } = require("@opensearch-project/opensearch"); +``` +{% include copy.html %} + +## Connecting to OpenSearch + +To connect to the default OpenSearch host, create a client object with the address `https://localhost:9200` if you are using the Security plugin: + +```javascript +var host = "localhost"; +var protocol = "https"; +var port = 9200; +var auth = "admin:"; // For testing only. Don't store credentials in code. +var ca_certs_path = "/full/path/to/root-ca.pem"; + +// Optional client certificates if you don't want to use HTTP basic authentication. 
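+// (Authenticating with client certificates typically also requires enabling certificate-based authentication in the Security plugin configuration.)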
+// var client_cert_path = '/full/path/to/client.pem' +// var client_key_path = '/full/path/to/client-key.pem' + +// Create a client with SSL/TLS enabled. +var { Client } = require("@opensearch-project/opensearch"); +var fs = require("fs"); +var client = new Client({ + node: protocol + "://" + auth + "@" + host + ":" + port, + ssl: { + ca: fs.readFileSync(ca_certs_path), + // You can turn off certificate verification (rejectUnauthorized: false) if you're using + // self-signed certificates with a hostname mismatch. + // cert: fs.readFileSync(client_cert_path), + // key: fs.readFileSync(client_key_path) + }, +}); +``` +{% include copy.html %} + +If you are not using the Security plugin, create a client object with the address `http://localhost:9200`: + +```javascript +var host = "localhost"; +var protocol = "http"; +var port = 9200; + +// Create a client +var { Client } = require("@opensearch-project/opensearch"); +var client = new Client({ + node: protocol + "://" + host + ":" + port +}); +``` +{% include copy.html %} + +## Authenticating with Amazon OpenSearch Service: AWS Signature Version 4 + +Use the following code to authenticate with AWS V2 SDK: + +```javascript +const AWS = require('aws-sdk'); // V2 SDK. +const { Client } = require('@opensearch-project/opensearch'); +const { AwsSigv4Signer } = require('@opensearch-project/opensearch/aws'); + +const client = new Client({ + ...AwsSigv4Signer({ + region: 'us-west-2', + service: 'es', + // Must return a Promise that resolve to an AWS.Credentials object. + // This function is used to acquire the credentials when the client start and + // when the credentials are expired. + // The Client will refresh the Credentials only when they are expired. + // With AWS SDK V2, Credentials.refreshPromise is used when available to refresh the credentials. + + // Example with AWS SDK V2: + getCredentials: () => + new Promise((resolve, reject) => { + // Any other method to acquire a new Credentials object can be used. + AWS.config.getCredentials((err, credentials) => { + if (err) { + reject(err); + } else { + resolve(credentials); + } + }); + }), + }), + node: 'https://search-xxx.region.es.amazonaws.com', // OpenSearch domain URL +}); +``` +{% include copy.html %} + +Use the following code to authenticate with the AWS V2 SDK for Amazon OpenSearch Serverless: + +```javascript +const AWS = require('aws-sdk'); // V2 SDK. +const { Client } = require('@opensearch-project/opensearch'); +const { AwsSigv4Signer } = require('@opensearch-project/opensearch/aws'); + +const client = new Client({ + ...AwsSigv4Signer({ + region: 'us-west-2', + service: 'aoss', + // Must return a Promise that resolve to an AWS.Credentials object. + // This function is used to acquire the credentials when the client start and + // when the credentials are expired. + // The Client will refresh the Credentials only when they are expired. + // With AWS SDK V2, Credentials.refreshPromise is used when available to refresh the credentials. + + // Example with AWS SDK V2: + getCredentials: () => + new Promise((resolve, reject) => { + // Any other method to acquire a new Credentials object can be used. 
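+        // With no explicit configuration, getCredentials typically resolves credentials from the
+        // SDK's default provider chain (environment variables, shared credentials file, or an instance/container role).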
+ AWS.config.getCredentials((err, credentials) => { + if (err) { + reject(err); + } else { + resolve(credentials); + } + }); + }), + }), + node: "https://xxx.region.aoss.amazonaws.com" // OpenSearch domain URL +}); +``` +{% include copy.html %} + +Use the following code to authenticate with AWS V3 SDK: + +```javascript +const { defaultProvider } = require('@aws-sdk/credential-provider-node'); // V3 SDK. +const { Client } = require('@opensearch-project/opensearch'); +const { AwsSigv4Signer } = require('@opensearch-project/opensearch/aws'); + +const client = new Client({ + ...AwsSigv4Signer({ + region: 'us-east-1', + service: 'es', // 'aoss' for OpenSearch Serverless + // Must return a Promise that resolve to an AWS.Credentials object. + // This function is used to acquire the credentials when the client start and + // when the credentials are expired. + // The Client will refresh the Credentials only when they are expired. + // With AWS SDK V2, Credentials.refreshPromise is used when available to refresh the credentials. + + // Example with AWS SDK V3: + getCredentials: () => { + // Any other method to acquire a new Credentials object can be used. + const credentialsProvider = defaultProvider(); + return credentialsProvider(); + }, + }), + node: 'https://search-xxx.region.es.amazonaws.com', // OpenSearch domain URL + // node: "https://xxx.region.aoss.amazonaws.com" for OpenSearch Serverless +}); +``` +{% include copy.html %} + +Use the following code to authenticate with the AWS V3 SDK for Amazon OpenSearch Serverless: + +```javascript +const { defaultProvider } = require('@aws-sdk/credential-provider-node'); // V3 SDK. +const { Client } = require('@opensearch-project/opensearch'); +const { AwsSigv4Signer } = require('@opensearch-project/opensearch/aws'); + +const client = new Client({ + ...AwsSigv4Signer({ + region: 'us-east-1', + service: 'aoss', + // Must return a Promise that resolve to an AWS.Credentials object. + // This function is used to acquire the credentials when the client start and + // when the credentials are expired. + // The Client will refresh the Credentials only when they are expired. + // With AWS SDK V2, Credentials.refreshPromise is used when available to refresh the credentials. + + // Example with AWS SDK V3: + getCredentials: () => { + // Any other method to acquire a new Credentials object can be used. + const credentialsProvider = defaultProvider(); + return credentialsProvider(); + }, + }), + node: "https://xxx.region.aoss.amazonaws.com" // OpenSearch domain URL +}); +``` +{% include copy.html %} + +### Authenticating from within an AWS Lambda function + +Within an AWS Lambda function, objects declared outside the handler function retain their initialization. For more information, see [Lambda Execution Environment](https://docs.aws.amazon.com/lambda/latest/dg/lambda-runtime-environment.html). Thus, you must initialize the OpenSearch client outside of the handler function to ensure the reuse of the original connection in subsequent invocations. This promotes efficiency and eliminates the need to create a new connection each time. + +Initializing the client within the handler function poses a potential risk of encountering a `ConnectionError: getaddrinfo EMFILE error`. This error occurs when multiple connections are created in subsequent invocations, exceeding the system's file descriptor limit. 
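+
+For clarity, the following minimal sketch (illustrative only) shows the problematic pattern that the preceding paragraph warns against, with the client created inside the handler so that every invocation opens new connections:
+
+```javascript
+const { Client } = require('@opensearch-project/opensearch');
+
+// Anti-pattern: don't create the client inside the handler.
+export const handler = async (event, context) => {
+  // A new client, and therefore a new set of connections, is created on every
+  // invocation, which can eventually exceed the file descriptor limit.
+  const client = new Client({ node: 'https://search-xxx.region.es.amazonaws.com' }); // OpenSearch domain URL
+  // ... use the client ...
+};
+```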
+ +The following example AWS Lambda function code demonstrates the correct initialization of the OpenSearch client: + +```javascript +const { defaultProvider } = require('@aws-sdk/credential-provider-node'); // V3 SDK. +const { Client } = require('@opensearch-project/opensearch'); +const { AwsSigv4Signer } = require('@opensearch-project/opensearch/aws'); + +const client = new Client({ + ...AwsSigv4Signer({ + region: 'us-east-1', + service: 'es', // 'aoss' for OpenSearch Serverless + // Must return a Promise that resolve to an AWS.Credentials object. + // This function is used to acquire the credentials when the client start and + // when the credentials are expired. + // The Client will refresh the Credentials only when they are expired. + // With AWS SDK V2, Credentials.refreshPromise is used when available to refresh the credentials. + + // Example with AWS SDK V3: + getCredentials: () => { + // Any other method to acquire a new Credentials object can be used. + const credentialsProvider = defaultProvider(); + return credentialsProvider(); + }, + }), + node: 'https://search-xxx.region.es.amazonaws.com', // OpenSearch domain URL + // node: "https://xxx.region.aoss.amazonaws.com" for OpenSearch Serverless +}); + +export const handler = async (event, context) => { + const indexName = "books"; + + const settings = { + settings: { + index: { + number_of_shards: 4, + number_of_replicas: 3, + }, + }, + }; + + // Use the already initialized client + const response = await client.indices.create({ + index: indexName, + body: settings, + }); + +}; +``` +{% include copy.html %} + + +## Creating an index + +To create an OpenSearch index, use the `indices.create()` method. You can use the following code to construct a JSON object with custom settings: + +```javascript +var index_name = "books"; + +var settings = { + settings: { + index: { + number_of_shards: 4, + number_of_replicas: 3, + }, + }, +}; + +var response = await client.indices.create({ + index: index_name, + body: settings, +}); +``` +{% include copy.html %} + +## Indexing a document + +You can index a document into OpenSearch using the client's `index` method: + +```javascript +var document = { + title: "The Outsider", + author: "Stephen King", + year: "2018", + genre: "Crime fiction", +}; + +var id = "1"; + +var response = await client.index({ + id: id, + index: index_name, + body: document, + refresh: true, +}); +``` +{% include copy.html %} + +## Searching for documents + +The easiest way to search for documents is to construct a query string. 
The following code uses a `match` query to search for "The Outsider" in the title field: + +```javascript +var query = { + query: { + match: { + title: { + query: "The Outsider", + }, + }, + }, +}; + +var response = await client.search({ + index: index_name, + body: query, +}); +``` +{% include copy.html %} + +## Updating a document + +You can update a document using the client's `update` method: + +```javascript +var response = await client.update({ + index: index_name, + id: id, + body: { + doc: { + // Specify the fields and their updated values here + field1: "new_value1", + field2: "new_value2", + // Add more fields as needed + } + } +}); +``` +{% include copy.html %} + +For example, the following code updates the `genre` field and adds a `tv_adapted` field to the document specified by `id`: + +```javascript +var response = await client.update({ + index: index_name, + id: id, + body: { + doc: { + genre: "Detective fiction", + tv_adapted: true + } + }, + refresh: true + }); +``` +{% include copy.html %} + +## Deleting a document + +You can delete a document using the client's `delete` method: + +```javascript +var response = await client.delete({ + index: index_name, + id: id, +}); +``` +{% include copy.html %} + +## Deleting an index + +You can delete an index using the `indices.delete()` method: + +```javascript +var response = await client.indices.delete({ + index: index_name, +}); +``` +{% include copy.html %} + +## Sample program + +The following sample program creates a client, adds an index with non-default settings, inserts a document, searches for the document, deletes the document, and then deletes the index: + +```javascript +"use strict"; + +var host = "localhost"; +var protocol = "https"; +var port = 9200; +var auth = "admin:"; // For testing only. Don't store credentials in code. +var ca_certs_path = "/full/path/to/root-ca.pem"; + +// Optional client certificates if you don't want to use HTTP basic authentication +// var client_cert_path = '/full/path/to/client.pem' +// var client_key_path = '/full/path/to/client-key.pem' + +// Create a client with SSL/TLS enabled +var { Client } = require("@opensearch-project/opensearch"); +var fs = require("fs"); +var client = new Client({ + node: protocol + "://" + auth + "@" + host + ":" + port, + ssl: { + ca: fs.readFileSync(ca_certs_path), + // You can turn off certificate verification (rejectUnauthorized: false) if you're using + // self-signed certificates with a hostname mismatch. 
+ // cert: fs.readFileSync(client_cert_path), + // key: fs.readFileSync(client_key_path) + }, +}); + +async function search() { + // Create an index with non-default settings + var index_name = "books"; + + var settings = { + settings: { + index: { + number_of_shards: 4, + number_of_replicas: 3, + }, + }, + }; + + var response = await client.indices.create({ + index: index_name, + body: settings, + }); + + console.log("Creating index:"); + console.log(response.body); + + // Add a document to the index + var document = { + title: "The Outsider", + author: "Stephen King", + year: "2018", + genre: "Crime fiction", + }; + + var id = "1"; + + var response = await client.index({ + id: id, + index: index_name, + body: document, + refresh: true, + }); + + console.log("Adding document:"); + console.log(response.body); + + // Search for the document + var query = { + query: { + match: { + title: { + query: "The Outsider", + }, + }, + }, + }; + + var response = await client.search({ + index: index_name, + body: query, + }); + + console.log("Search results:"); + console.log(JSON.stringify(response.body.hits, null, " ")); + + // Update a document + var response = await client.update({ + index: index_name, + id: id, + body: { + doc: { + genre: "Detective fiction", + tv_adapted: true + } + }, + refresh: true + }); + + // Search for the updated document + var query = { + query: { + match: { + title: { + query: "The Outsider", + }, + }, + }, + }; + + var response = await client.search({ + index: index_name, + body: query, + }); + + console.log("Search results:"); + console.log(JSON.stringify(response.body.hits, null, " ")); + + // Delete the document + var response = await client.delete({ + index: index_name, + id: id, + }); + + console.log("Deleting document:"); + console.log(response.body); + + // Delete the index + var response = await client.indices.delete({ + index: index_name, + }); + + console.log("Deleting index:"); + console.log(response.body); +} + +search().catch(console.log); +``` +{% include copy.html %} + +## Circuit breaker + +The `memoryCircuitBreaker` option can be used to prevent errors caused by a response payload being too large to fit into the heap memory available to the client. + +The `memoryCircuitBreaker` object contains two fields: + +- `enabled`: A Boolean used to turn the circuit breaker on or off. Defaults to `false`. +- `maxPercentage`: The threshold that determines whether the circuit breaker engages. Valid values are floats in the [0, 1] range that represent percentages in decimal form. Any value that exceeds that range will correct to `1.0`. + +The following example instantiates a client with the circuit breaker enabled and its threshold set to 80% of the available heap size limit: + +```javascript +var client = new Client({ + memoryCircuitBreaker: { + enabled: true, + maxPercentage: 0.8, + }, +}); +``` +{% include copy.html %} diff --git a/_clients/logstash/ship-to-opensearch.md b/_clients/logstash/ship-to-opensearch.md deleted file mode 100644 index e9ef7260..00000000 --- a/_clients/logstash/ship-to-opensearch.md +++ /dev/null @@ -1,77 +0,0 @@ ---- -layout: default -title: Ship events to OpenSearch -parent: Logstash -nav_order: 220 ---- - -# Ship events to OpenSearch - -You can Ship Logstash events to an OpenSearch cluster and then visualize your events with OpenSearch Dashboards. 
- -Make sure you have [Logstash]({{site.url}}{{site.baseurl}}/clients/logstash/index/#install-logstash), [OpenSearch]({{site.url}}{{site.baseurl}}/opensearch/install/index/), and [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/install/index/). -{: .note } - -## OpenSearch output plugin - -To run the OpenSearch output plugin, add the following configuration in your `pipeline.conf` file: - -```yml -output { - opensearch { - hosts => "https://localhost:9200" - user => "admin" - password => "admin" - index => "logstash-logs-%{+YYYY.MM.dd}" - ssl_certificate_verification => false - } -} -``` - - -## Sample walkthrough - -1. Open the `config/pipeline.conf` file and add in the following configuration: - - ```yml - input { - stdin { - codec => json - } - } - - output { - opensearch { - hosts => "https://localhost:9200" - user => "admin" - password => "admin" - index => "logstash-logs-%{+YYYY.MM.dd}" - ssl_certificate_verification => false - } - } - ``` - - This Logstash pipeline accepts JSON input through the terminal and ships the events to an OpenSearch cluster running locally. Logstash writes the events to an index with the `logstash-logs-%{+YYYY.MM.dd}` naming convention. - -2. Start Logstash: - - ```bash - $ bin/logstash -f config/pipeline.conf --config.reload.automatic - ``` - - `config/pipeline.conf` is a relative path to the `pipeline.conf` file. You can use an absolute path as well. - -3. Add a JSON object in the terminal: - - ```json - { "amount": 10, "quantity": 2} - ``` - -4. Start OpenSearch Dashboards and choose **Dev Tools**: - - ```json - GET _cat/indices?v - - health | status | index | uuid | pri | rep | docs.count | docs.deleted | store.size | pri.store.size - green | open | logstash-logs-2021.07.01 | iuh648LYSnmQrkGf70pplA | 1 | 1 | 1 | 0 | 10.3kb | 5.1kb - ``` diff --git a/_clients/opensearch-py-ml.md b/_clients/opensearch-py-ml.md new file mode 100644 index 00000000..de3704b9 --- /dev/null +++ b/_clients/opensearch-py-ml.md @@ -0,0 +1,44 @@ +--- +layout: default +title: Opensearch-py-ml +nav_order: 11 +--- + +# opensearch-py-ml + +`opensearch-py-ml` is a Python client that provides a suite of data analytics and natural language processing (NLP) support tools for OpenSearch. It provides data analysts with the ability to: + +- Call OpenSearch indexes and manipulate them using the opensearch-py-ml [DataFrame](https://opensearch-project.github.io/opensearch-py-ml/reference/dataframe.html) APIs. The opensearch-py-ml DataFrame wraps an OpenSearch index into an API similar to [pandas](https://pandas.pydata.org/), giving you the ability to process large amounts of data from OpenSearch inside a Jupyter Notebook. +- Upload NLP [SentenceTransformer](https://www.sbert.net/) models into OpenSearch using the [ML Commons plugin]({{site.url}}{{site.baseurl}}/ml-commons-plugin/index/). +- Train and tune SentenceTransformer models with synthetic queries. + +## Prerequisites + +To use `opensearch-py-ml`, install the [OpenSearch Python client]({{site.url}}{{site.baseurl}}/clients/python-low-level#setup). The Python client allows OpenSearch to use the Python syntax required to run DataFrames in `opensearch-py-ml`. 
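+
+As a quick illustration of the DataFrame workflow, the following minimal sketch assumes a running local cluster, an existing index named `my-index`, and that the `DataFrame` constructor accepts a client and an index name; see the API reference below for the exact signature:
+
+```python
+from opensearchpy import OpenSearch
+import opensearch_py_ml as oml
+
+# Connect to a local cluster. Add authentication and TLS settings as needed.
+client = OpenSearch(hosts=[{'host': 'localhost', 'port': 9200}])
+
+# Wrap an existing index in an opensearch-py-ml DataFrame and inspect it
+# with pandas-like methods.
+df = oml.DataFrame(client, 'my-index')
+print(df.head())
+```
+{% include copy.html %}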
+ +## Install `opensearch-py-ml` + +To add the client to your project, install it using [pip](https://pip.pypa.io/): + +```bash +pip install opensearch-py-ml +``` +{% include copy.html %} + +Then import the client into OpenSearch like any other module: + +```python +from opensearchpy import OpenSearch +import opensearch_py_ml as oml +``` +{% include copy.html %} + +## API reference + +For information on all opensearch-py-ml objects, functions, and methods, see the [opensearch-py-ml API reference](https://opensearch-project.github.io/opensearch-py-ml/reference/index.html). + +## Next steps + +If you want to track or contribute to the development of the `opensearch-py-ml` client, see the [opensearch-py-ml GitHub repository](https://github.com/opensearch-project/opensearch-py-ml). + +For example Python notebooks to use with the client, see [Examples](https://opensearch-project.github.io/opensearch-py-ml/examples/index.html). diff --git a/_clients/php.md b/_clients/php.md new file mode 100644 index 00000000..785614c2 --- /dev/null +++ b/_clients/php.md @@ -0,0 +1,251 @@ +--- +layout: default +title: PHP client +nav_order: 70 +--- + +# PHP client + +The OpenSearch PHP client provides a safer and easier way to interact with your OpenSearch cluster. Rather than using OpenSearch from a browser and potentially exposing your data to the public, you can build an OpenSearch client that takes care of sending requests to your cluster. The client contains a library of APIs that let you perform different operations on your cluster and return a standard response body. + +This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [opensearch-php repo](https://github.com/opensearch-project/opensearch-php). + +## Setup + +To add the client to your project, install it using [composer](https://getcomposer.org/): + +```bash +composer require opensearch-project/opensearch-php +``` +{% include copy.html %} + +To install a specific major version of the client, run the following command: + +```bash +composer require opensearch-project/opensearch-php: +``` +{% include copy.html %} + +Then require the autload file from composer in your code: + +```php +require __DIR__ . '/vendor/autoload.php'; +``` +{% include copy.html %} + +## Connecting to OpenSearch + +To connect to the default OpenSearch host, create a client object with the address `https://localhost:9200` if you are using the Security plugin: + +```php +$client = (new \OpenSearch\ClientBuilder()) + ->setHosts(['https://localhost:9200']) + ->setBasicAuthentication('admin', 'admin') // For testing only. Don't store credentials in code. + ->setSSLVerification(false) // For testing only. Use certificate for validation + ->build(); +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Service + +The following example illustrates connecting to Amazon OpenSearch Service: + +```php +$client = (new \OpenSearch\ClientBuilder()) + ->setSigV4Region('us-east-2') + + ->setSigV4Service('es') + + // Default credential provider. 
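+    // Passing true typically uses the AWS SDK's default credential provider chain
+    // (environment variables, shared config files, or an instance profile).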
+ ->setSigV4CredentialProvider(true) + + // Using a custom access key and secret + ->setSigV4CredentialProvider([ + 'key' => 'awskeyid', + 'secret' => 'awssecretkey', + ]) + + ->build(); +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Serverless + +The following example illustrates connecting to Amazon OpenSearch Serverless Service: + +```php +$client = (new \OpenSearch\ClientBuilder()) + ->setSigV4Region('us-east-2') + + ->setSigV4Service('aoss') + + // Default credential provider. + ->setSigV4CredentialProvider(true) + + // Using a custom access key and secret + ->setSigV4CredentialProvider([ + 'key' => 'awskeyid', + 'secret' => 'awssecretkey', + ]) + + ->build(); +``` +{% include copy.html %} + + +## Creating an index + +To create an OpenSearch index with custom settings, use the following code: + +```php +$indexName = 'test-index-name'; + +// Create an index with non-default settings. +$client->indices()->create([ + 'index' => $indexName, + 'body' => [ + 'settings' => [ + 'index' => [ + 'number_of_shards' => 4 + ] + ] + ] +]); +``` +{% include copy.html %} + +## Indexing a document + +You can index a document into OpenSearch using the following code: + +```php +$client->create([ + 'index' => $indexName, + 'id' => 1, + 'body' => [ + 'title' => 'Moneyball', + 'director' => 'Bennett Miller', + 'year' => 2011 + ] +]); +``` +{% include copy.html %} + +## Searching for documents + +The following code uses a `multi_match` query to search for "miller" in the title and director fields. It boosts the documents where "miller" appears in the title field: + +```php +var_dump( + $client->search([ + 'index' => $indexName, + 'body' => [ + 'size' => 5, + 'query' => [ + 'multi_match' => [ + 'query' => 'miller', + 'fields' => ['title^2', 'director'] + ] + ] + ] + ]) +); +``` +{% include copy.html %} + +## Deleting a document + +You can delete a document using the following code: + +```php +$client->delete([ + 'index' => $indexName, + 'id' => 1, +]); +``` +{% include copy.html %} + +## Deleting an index + +You can delete an index using the following code: + +```php +$client->indices()->delete([ + 'index' => $indexName +]); +``` +{% include copy.html %} + +## Sample program + +The following sample program creates a client, adds an index with non-default settings, inserts a document, searches for the document, deletes the document, and then deletes the index: + +```php +setHosts(['https://localhost:9200']) + ->setBasicAuthentication('admin', 'admin') // For testing only. Don't store credentials in code. + ->setSSLVerification(false) // For testing only. Use certificate for validation + ->build(); + +$indexName = 'test-index-name'; + +// Print OpenSearch version information on console. +var_dump($client->info()); + +// Create an index with non-default settings. 
+$client->indices()->create([ + 'index' => $indexName, + 'body' => [ + 'settings' => [ + 'index' => [ + 'number_of_shards' => 4 + ] + ] + ] +]); + +$client->create([ + 'index' => $indexName, + 'id' => 1, + 'body' => [ + 'title' => 'Moneyball', + 'director' => 'Bennett Miller', + 'year' => 2011 + ] +]); + +// Search for it +var_dump( + $client->search([ + 'index' => $indexName, + 'body' => [ + 'size' => 5, + 'query' => [ + 'multi_match' => [ + 'query' => 'miller', + 'fields' => ['title^2', 'director'] + ] + ] + ] + ]) +); + +// Delete a single document +$client->delete([ + 'index' => $indexName, + 'id' => 1, +]); + + +// Delete index +$client->indices()->delete([ + 'index' => $indexName +]); + +?> +``` +{% include copy.html %} \ No newline at end of file diff --git a/_clients/python-high-level.md b/_clients/python-high-level.md new file mode 100644 index 00000000..ff529eb0 --- /dev/null +++ b/_clients/python-high-level.md @@ -0,0 +1,306 @@ +--- +layout: default +title: High-level Python client +nav_order: 5 +--- + +The OpenSearch high-level Python client (`opensearch-dsl-py`) will be deprecated after version 2.1.0. We recommend switching to the [Python client (`opensearch-py`)]({{site.url}}{{site.baseurl}}/clients/python-low-level/), which now includes the functionality of `opensearch-dsl-py`. +{: .warning} + +# High-level Python client + +The OpenSearch high-level Python client (`opensearch-dsl-py`) provides wrapper classes for common OpenSearch entities, like documents, so you can work with them as Python objects. Additionally, the high-level client simplifies writing queries and supplies convenient Python methods for common OpenSearch operations. The high-level Python client supports creating and indexing documents, searching with and without filters, and updating documents using queries. + +This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [opensearch-dsl-py repo](https://github.com/opensearch-project/opensearch-dsl-py). + +## Setup + +To add the client to your project, install it using [pip](https://pip.pypa.io/): + +```bash +pip install opensearch-dsl +``` +{% include copy.html %} + +After installing the client, you can import it like any other module: + +```python +from opensearchpy import OpenSearch +from opensearch_dsl import Search +``` +{% include copy.html %} + +## Connecting to OpenSearch + +To connect to the default OpenSearch host, create a client object with SSL enabled if you are using the Security plugin. You can use the default credentials for testing purposes: + +```python +host = 'localhost' +port = 9200 +auth = ('admin', 'admin') # For testing only. Don't store credentials in code. +ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA. + +# Create the client with SSL/TLS enabled, but hostname verification disabled. +client = OpenSearch( + hosts = [{'host': host, 'port': port}], + http_compress = True, # enables gzip compression for request bodies + http_auth = auth, + use_ssl = True, + verify_certs = True, + ssl_assert_hostname = False, + ssl_show_warn = False, + ca_certs = ca_certs_path +) +``` +{% include copy.html %} + +If you have your own client certificates, specify them in the `client_cert_path` and `client_key_path` parameters: + +```python +host = 'localhost' +port = 9200 +auth = ('admin', 'admin') # For testing only. Don't store credentials in code. 
+ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA. + +# Optional client certificates if you don't want to use HTTP basic authentication. +client_cert_path = '/full/path/to/client.pem' +client_key_path = '/full/path/to/client-key.pem' + +# Create the client with SSL/TLS enabled, but hostname verification disabled. +client = OpenSearch( + hosts = [{'host': host, 'port': port}], + http_compress = True, # enables gzip compression for request bodies + http_auth = auth, + client_cert = client_cert_path, + client_key = client_key_path, + use_ssl = True, + verify_certs = True, + ssl_assert_hostname = False, + ssl_show_warn = False, + ca_certs = ca_certs_path +) +``` +{% include copy.html %} + +If you are not using the Security plugin, create a client object with SSL disabled: + +```python +host = 'localhost' +port = 9200 + +# Create the client with SSL/TLS and hostname verification disabled. +client = OpenSearch( + hosts = [{'host': host, 'port': port}], + http_compress = True, # enables gzip compression for request bodies + use_ssl = False, + verify_certs = False, + ssl_assert_hostname = False, + ssl_show_warn = False +) +``` +{% include copy.html %} + +## Creating an index + +To create an OpenSearch index, use the `client.indices.create()` method. You can use the following code to construct a JSON object with custom settings: + +```python +index_name = 'my-dsl-index' +index_body = { + 'settings': { + 'index': { + 'number_of_shards': 4 + } + } +} + +response = client.indices.create(index_name, body=index_body) +``` +{% include copy.html %} + +## Indexing a document + +You can create a class to represent the documents that you'll index in OpenSearch by extending the `Document` class: + +```python +class Movie(Document): + title = Text(fields={'raw': Keyword()}) + director = Text() + year = Text() + + class Index: + name = index_name + + def save(self, ** kwargs): + return super(Movie, self).save(** kwargs) +``` +{% include copy.html %} + +To index a document, create an object of the new class and call its `save()` method: + +```python +# Set up the opensearch-py version of the document +Movie.init(using=client) +doc = Movie(meta={'id': 1}, title='Moneyball', director='Bennett Miller', year='2011') +response = doc.save(using=client) +``` +{% include copy.html %} + +## Performing bulk operations + +You can perform several operations at the same time by using the `bulk()` method of the client. The operations may be of the same type or of different types. Note that the operations must be separated by a `\n` and the entire string must be a single line: + +```python +movies = '{ "index" : { "_index" : "my-dsl-index", "_id" : "2" } } \n { "title" : "Interstellar", "director" : "Christopher Nolan", "year" : "2014"} \n { "create" : { "_index" : "my-dsl-index", "_id" : "3" } } \n { "title" : "Star Trek Beyond", "director" : "Justin Lin", "year" : "2015"} \n { "update" : {"_id" : "3", "_index" : "my-dsl-index" } } \n { "doc" : {"year" : "2016"} }' + +client.bulk(movies) +``` +{% include copy.html %} + +## Searching for documents + +You can use the `Search` class to construct a query. 
The following code creates a Boolean query with a filter: + +```python +s = Search(using=client, index=index_name) \ + .filter("term", year="2011") \ + .query("match", title="Moneyball") + +response = s.execute() +``` +{% include copy.html %} + +The preceding query is equivalent to the following query in OpenSearch domain-specific language (DSL): + +```json +GET my-dsl-index/_search +{ + "query": { + "bool": { + "must": { + "match": { + "title": "Moneyball" + } + }, + "filter": { + "term" : { + "year": 2011 + } + } + } + } +} +``` + +## Deleting a document + +You can delete a document using the `client.delete()` method: + +```python +response = client.delete( + index = 'my-dsl-index', + id = '1' +) +``` +{% include copy.html %} + +## Deleting an index + +You can delete an index using the `client.indices.delete()` method: + +```python +response = client.indices.delete( + index = 'my-dsl-index' +) +``` +{% include copy.html %} + +## Sample program + +The following sample program creates a client, adds an index with non-default settings, inserts a document, performs bulk operations, searches for the document, deletes the document, and then deletes the index: + +```python +from opensearchpy import OpenSearch +from opensearch_dsl import Search, Document, Text, Keyword + +host = 'localhost' +port = 9200 + +auth = ('admin', 'admin') # For testing only. Don't store credentials in code. +ca_certs_path = 'root-ca.pem' + +# Create the client with SSL/TLS enabled, but hostname verification disabled. +client = OpenSearch( + hosts=[{'host': host, 'port': port}], + http_compress=True, # enables gzip compression for request bodies + # http_auth=auth, + use_ssl=False, + verify_certs=False, + ssl_assert_hostname=False, + ssl_show_warn=False, + # ca_certs=ca_certs_path +) +index_name = 'my-dsl-index' + +index_body = { + 'settings': { + 'index': { + 'number_of_shards': 4 + } + } +} + +response = client.indices.create(index_name, index_body) +print('\nCreating index:') +print(response) + +# Create the structure of the document +class Movie(Document): + title = Text(fields={'raw': Keyword()}) + director = Text() + year = Text() + + class Index: + name = index_name + + def save(self, ** kwargs): + return super(Movie, self).save(** kwargs) + +# Set up the opensearch-py version of the document +Movie.init(using=client) +doc = Movie(meta={'id': 1}, title='Moneyball', director='Bennett Miller', year='2011') +response = doc.save(using=client) + +print('\nAdding document:') +print(response) + +# Perform bulk operations + +movies = '{ "index" : { "_index" : "my-dsl-index", "_id" : "2" } } \n { "title" : "Interstellar", "director" : "Christopher Nolan", "year" : "2014"} \n { "create" : { "_index" : "my-dsl-index", "_id" : "3" } } \n { "title" : "Star Trek Beyond", "director" : "Justin Lin", "year" : "2015"} \n { "update" : {"_id" : "3", "_index" : "my-dsl-index" } } \n { "doc" : {"year" : "2016"} }' + +client.bulk(movies) + +# Search for the document. +s = Search(using=client, index=index_name) \ + .filter('term', year='2011') \ + .query('match', title='Moneyball') + +response = s.execute() + +print('\nSearch results:') +for hit in response: + print(hit.meta.score, hit.title) + +# Delete the document. +print('\nDeleting document:') +print(response) + +# Delete the index. 
+response = client.indices.delete( + index = index_name +) + +print('\nDeleting index:') +print(response) +``` +{% include copy.html %} \ No newline at end of file diff --git a/_clients/python-low-level.md b/_clients/python-low-level.md new file mode 100644 index 00000000..894bef0e --- /dev/null +++ b/_clients/python-low-level.md @@ -0,0 +1,356 @@ +--- +layout: default +title: Low-level Python client +nav_order: 10 +redirect_from: + - /clients/python/ +--- + +# Low-level Python client + +The OpenSearch low-level Python client (`opensearch-py`) provides wrapper methods for the OpenSearch REST API so that you can interact with your cluster more naturally in Python. Rather than sending raw HTTP requests to a given URL, you can create an OpenSearch client for your cluster and call the client's built-in functions. For the client's complete API documentation and additional examples, see the [`opensearch-py` API documentation](https://opensearch-project.github.io/opensearch-py/). + +This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [`opensearch-py` repo](https://github.com/opensearch-project/opensearch-py). + +## Setup + +To add the client to your project, install it using [pip](https://pip.pypa.io/): + +```bash +pip install opensearch-py +``` +{% include copy.html %} + +After installing the client, you can import it like any other module: + +```python +from opensearchpy import OpenSearch +``` +{% include copy.html %} + +## Connecting to OpenSearch + +To connect to the default OpenSearch host, create a client object with SSL enabled if you are using the Security plugin. You can use the default credentials for testing purposes: + +```python +host = 'localhost' +port = 9200 +auth = ('admin', 'admin') # For testing only. Don't store credentials in code. +ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA. + +# Create the client with SSL/TLS enabled, but hostname verification disabled. +client = OpenSearch( + hosts = [{'host': host, 'port': port}], + http_compress = True, # enables gzip compression for request bodies + http_auth = auth, + use_ssl = True, + verify_certs = True, + ssl_assert_hostname = False, + ssl_show_warn = False, + ca_certs = ca_certs_path +) +``` +{% include copy.html %} + +If you have your own client certificates, specify them in the `client_cert_path` and `client_key_path` parameters: + +```python +host = 'localhost' +port = 9200 +auth = ('admin', 'admin') # For testing only. Don't store credentials in code. +ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA. + +# Optional client certificates if you don't want to use HTTP basic authentication. +client_cert_path = '/full/path/to/client.pem' +client_key_path = '/full/path/to/client-key.pem' + +# Create the client with SSL/TLS enabled, but hostname verification disabled. 
+client = OpenSearch( + hosts = [{'host': host, 'port': port}], + http_compress = True, # enables gzip compression for request bodies + http_auth = auth, + client_cert = client_cert_path, + client_key = client_key_path, + use_ssl = True, + verify_certs = True, + ssl_assert_hostname = False, + ssl_show_warn = False, + ca_certs = ca_certs_path +) +``` +{% include copy.html %} + +If you are not using the Security plugin, create a client object with SSL disabled: + +```python +host = 'localhost' +port = 9200 + +# Create the client with SSL/TLS and hostname verification disabled. +client = OpenSearch( + hosts = [{'host': host, 'port': port}], + http_compress = True, # enables gzip compression for request bodies + use_ssl = False, + verify_certs = False, + ssl_assert_hostname = False, + ssl_show_warn = False +) +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Service + +The following example illustrates connecting to Amazon OpenSearch Service: + +```python +from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth +import boto3 + +host = '' # cluster endpoint, for example: my-test-domain.us-east-1.es.amazonaws.com +region = 'us-west-2' +service = 'es' +credentials = boto3.Session().get_credentials() +auth = AWSV4SignerAuth(credentials, region, service) + +client = OpenSearch( + hosts = [{'host': host, 'port': 443}], + http_auth = auth, + use_ssl = True, + verify_certs = True, + connection_class = RequestsHttpConnection, + pool_maxsize = 20 +) +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Serverless + +The following example illustrates connecting to Amazon OpenSearch Serverless Service: + +```python +from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth +import boto3 + +host = '' # cluster endpoint, for example: my-test-domain.us-east-1.aoss.amazonaws.com +region = 'us-west-2' +service = 'aoss' +credentials = boto3.Session().get_credentials() +auth = AWSV4SignerAuth(credentials, region, service) + +client = OpenSearch( + hosts = [{'host': host, 'port': 443}], + http_auth = auth, + use_ssl = True, + verify_certs = True, + connection_class = RequestsHttpConnection, + pool_maxsize = 20 +) +``` +{% include copy.html %} + + +## Creating an index + +To create an OpenSearch index, use the `client.indices.create()` method. You can use the following code to construct a JSON object with custom settings: + +```python +index_name = 'python-test-index' +index_body = { + 'settings': { + 'index': { + 'number_of_shards': 4 + } + } +} + +response = client.indices.create(index_name, body=index_body) +``` +{% include copy.html %} + +## Indexing a document + +You can index a document using the `client.index()` method: + +```python +document = { + 'title': 'Moneyball', + 'director': 'Bennett Miller', + 'year': '2011' +} + +response = client.index( + index = 'python-test-index', + body = document, + id = '1', + refresh = True +) +``` +{% include copy.html %} + +## Performing bulk operations + +You can perform several operations at the same time by using the `bulk()` method of the client. The operations may be of the same type or of different types. 
Note that the operations must be separated by a `\n` and the entire string must be a single line: + +```python +movies = '{ "index" : { "_index" : "my-dsl-index", "_id" : "2" } } \n { "title" : "Interstellar", "director" : "Christopher Nolan", "year" : "2014"} \n { "create" : { "_index" : "my-dsl-index", "_id" : "3" } } \n { "title" : "Star Trek Beyond", "director" : "Justin Lin", "year" : "2015"} \n { "update" : {"_id" : "3", "_index" : "my-dsl-index" } } \n { "doc" : {"year" : "2016"} }' + +client.bulk(movies) +``` +{% include copy.html %} + +## Searching for documents + +The easiest way to search for documents is to construct a query string. The following code uses a multi-match query to search for “miller” in the title and director fields. It boosts the documents that have “miller” in the title field: + +```python +q = 'miller' +query = { + 'size': 5, + 'query': { + 'multi_match': { + 'query': q, + 'fields': ['title^2', 'director'] + } + } +} + +response = client.search( + body = query, + index = 'python-test-index' +) +``` +{% include copy.html %} + +## Deleting a document + +You can delete a document using the `client.delete()` method: + +```python +response = client.delete( + index = 'python-test-index', + id = '1' +) +``` +{% include copy.html %} + +## Deleting an index + +You can delete an index using the `client.indices.delete()` method: + +```python +response = client.indices.delete( + index = 'python-test-index' +) +``` +{% include copy.html %} + +## Sample program + +The following sample program creates a client, adds an index with non-default settings, inserts a document, performs bulk operations, searches for the document, deletes the document, and then deletes the index: + +```python +from opensearchpy import OpenSearch + +host = 'localhost' +port = 9200 +auth = ('admin', 'admin') # For testing only. Don't store credentials in code. +ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA. + +# Optional client certificates if you don't want to use HTTP basic authentication. +# client_cert_path = '/full/path/to/client.pem' +# client_key_path = '/full/path/to/client-key.pem' + +# Create the client with SSL/TLS enabled, but hostname verification disabled. +client = OpenSearch( + hosts = [{'host': host, 'port': port}], + http_compress = True, # enables gzip compression for request bodies + http_auth = auth, + # client_cert = client_cert_path, + # client_key = client_key_path, + use_ssl = True, + verify_certs = True, + ssl_assert_hostname = False, + ssl_show_warn = False, + ca_certs = ca_certs_path +) + +# Create an index with non-default settings. +index_name = 'python-test-index' +index_body = { + 'settings': { + 'index': { + 'number_of_shards': 4 + } + } +} + +response = client.indices.create(index_name, body=index_body) +print('\nCreating index:') +print(response) + +# Add a document to the index. 
+document = { + 'title': 'Moneyball', + 'director': 'Bennett Miller', + 'year': '2011' +} +id = '1' + +response = client.index( + index = index_name, + body = document, + id = id, + refresh = True +) + +print('\nAdding document:') +print(response) + +# Perform bulk operations + +movies = '{ "index" : { "_index" : "my-dsl-index", "_id" : "2" } } \n { "title" : "Interstellar", "director" : "Christopher Nolan", "year" : "2014"} \n { "create" : { "_index" : "my-dsl-index", "_id" : "3" } } \n { "title" : "Star Trek Beyond", "director" : "Justin Lin", "year" : "2015"} \n { "update" : {"_id" : "3", "_index" : "my-dsl-index" } } \n { "doc" : {"year" : "2016"} }' + +client.bulk(movies) + +# Search for the document. +q = 'miller' +query = { + 'size': 5, + 'query': { + 'multi_match': { + 'query': q, + 'fields': ['title^2', 'director'] + } + } +} + +response = client.search( + body = query, + index = index_name +) +print('\nSearch results:') +print(response) + +# Delete the document. +response = client.delete( + index = index_name, + id = id +) + +print('\nDeleting document:') +print(response) + +# Delete the index. +response = client.indices.delete( + index = index_name +) + +print('\nDeleting index:') +print(response) +``` +{% include copy.html %} + +## Next steps + +- For Python client API, see the [`opensearch-py` API documentation](https://opensearch-project.github.io/opensearch-py/). +- For Python code samples, see [Samples](https://github.com/opensearch-project/opensearch-py/tree/main/samples). \ No newline at end of file diff --git a/_clients/python.md b/_clients/python.md deleted file mode 100644 index 10a856a2..00000000 --- a/_clients/python.md +++ /dev/null @@ -1,128 +0,0 @@ ---- -layout: default -title: Python client -nav_order: 70 ---- - -# Python client - -The OpenSearch Python client provides a more natural syntax for interacting with your cluster. Rather than sending HTTP requests to a given URL, you can create an OpenSearch client for your cluster and call the client's built-in functions. - -{% comment %} -`opensearch-py` is the lower-level of the two Python clients. If you want a general client for assorted operations, it's a great choice. If you want a higher-level client strictly for indexing and search operations, consider [opensearch-dsl-py]({{site.url}}{{site.baseurl}}/clients/python-dsl/). -{% endcomment %} - - -## Setup - -To add the client to your project, install it using [pip](https://pip.pypa.io/): - -```bash -pip install opensearch-py -``` - -Then import it like any other module: - -```python -from opensearchpy import OpenSearch -``` - -If you prefer to add the client manually or just want to examine the source code, see [opensearch-py on GitHub](https://github.com/opensearch-project/opensearch-py). - - -## Sample code - -```python -from opensearchpy import OpenSearch - -host = 'localhost' -port = 9200 -auth = ('admin', 'admin') # For testing only. Don't store credentials in code. -ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA. - -# Optional client certificates if you don't want to use HTTP basic authentication. -# client_cert_path = '/full/path/to/client.pem' -# client_key_path = '/full/path/to/client-key.pem' - -# Create the client with SSL/TLS enabled, but hostname verification disabled. 
-client = OpenSearch( - hosts = [{'host': host, 'port': port}], - http_compress = True, # enables gzip compression for request bodies - http_auth = auth, - # client_cert = client_cert_path, - # client_key = client_key_path, - use_ssl = True, - verify_certs = True, - ssl_assert_hostname = False, - ssl_show_warn = False, - ca_certs = ca_certs_path -) - -# Create an index with non-default settings. -index_name = 'python-test-index' -index_body = { - 'settings': { - 'index': { - 'number_of_shards': 4 - } - } -} - -response = client.indices.create(index_name, body=index_body) -print('\nCreating index:') -print(response) - -# Add a document to the index. -document = { - 'title': 'Moneyball', - 'director': 'Bennett Miller', - 'year': '2011' -} -id = '1' - -response = client.index( - index = index_name, - body = document, - id = id, - refresh = True -) - -print('\nAdding document:') -print(response) - -# Search for the document. -q = 'miller' -query = { - 'size': 5, - 'query': { - 'multi_match': { - 'query': q, - 'fields': ['title^2', 'director'] - } - } -} - -response = client.search( - body = query, - index = index_name -) -print('\nSearch results:') -print(response) - -# Delete the document. -response = client.delete( - index = index_name, - id = id -) - -print('\nDeleting document:') -print(response) - -# Delete the index. -response = client.indices.delete( - index = index_name -) - -print('\nDeleting index:') -print(response) -``` diff --git a/_clients/ruby.md b/_clients/ruby.md new file mode 100644 index 00000000..7d582927 --- /dev/null +++ b/_clients/ruby.md @@ -0,0 +1,656 @@ +--- +layout: default +title: Ruby client +nav_order: 60 +has_children: false +--- + +# Ruby client + +The OpenSearch Ruby client allows you to interact with your OpenSearch clusters through Ruby methods rather than HTTP methods and raw JSON. For the client's complete API documentation and additional examples, see the [`opensearch-transport`](https://rubydoc.info/gems/opensearch-transport), [`opensearch-api`](https://rubydoc.info/gems/opensearch-api), [`opensearch-dsl`](https://rubydoc.info/gems/opensearch-dsl), and [`opensearch-ruby`](https://rubydoc.info/gems/opensearch-ruby/) gem documentation. + +This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [opensearch-ruby repo](https://github.com/opensearch-project/opensearch-ruby). + +## Installing the Ruby client + +To install the Ruby gem for the Ruby client, run the following command: + +```bash +gem install opensearch-ruby +``` +{% include copy.html %} + +To use the client, import it as a module: + +```ruby +require 'opensearch' +``` +{% include copy.html %} + +## Connecting to OpenSearch + +To connect to the default OpenSearch host, create a client object, passing the default host address in the constructor: + +```ruby +client = OpenSearch::Client.new(host: 'http://localhost:9200') +``` +{% include copy.html %} + +The following example creates a client object with a custom URL and the `log` option set to `true`. It sets the `retry_on_failure` parameter to retry a failed request five times rather than the default three times. Finally, it increases the timeout by setting the `request_timeout` parameter to 120 seconds. 
It then returns the basic cluster health information: + +```ruby +client = OpenSearch::Client.new( + url: "http://localhost:9200", + retry_on_failure: 5, + request_timeout: 120, + log: true + ) + +client.cluster.health +``` +{% include copy.html %} + +The output is as follows: + +```bash +2022-08-25 14:24:52 -0400: GET http://localhost:9200/ [status:200, request:0.048s, query:n/a] +2022-08-25 14:24:52 -0400: < { + "name" : "opensearch", + "cluster_name" : "docker-cluster", + "cluster_uuid" : "Aw0F5Pt9QF6XO9vXQHIs_w", + "version" : { + "distribution" : "opensearch", + "number" : "2.2.0", + "build_type" : "tar", + "build_hash" : "b1017fa3b9a1c781d4f34ecee411e0cdf930a515", + "build_date" : "2022-08-09T02:27:25.256769336Z", + "build_snapshot" : false, + "lucene_version" : "9.3.0", + "minimum_wire_compatibility_version" : "7.10.0", + "minimum_index_compatibility_version" : "7.0.0" + }, + "tagline" : "The OpenSearch Project: https://opensearch.org/" +} + +2022-08-25 14:24:52 -0400: GET http://localhost:9200/_cluster/health [status:200, request:0.018s, query:n/a] +2022-08-25 14:24:52 -0400: < {"cluster_name":"docker-cluster","status":"yellow","timed_out":false,"number_of_nodes":1,"number_of_data_nodes":1,"discovered_master":true,"discovered_cluster_manager":true,"active_primary_shards":10,"active_shards":10,"relocating_shards":0,"initializing_shards":0,"unassigned_shards":8,"delayed_unassigned_shards":0,"number_of_pending_tasks":0,"number_of_in_flight_fetch":0,"task_max_waiting_in_queue_millis":0,"active_shards_percent_as_number":55.55555555555556} +``` + +## Connecting to Amazon OpenSearch Service + +To connect to Amazon OpenSearch Service, first install the `opensearch-aws-sigv4` gem: + +```bash +gem install opensearch-aws-sigv4 +``` + +```ruby +require 'opensearch-aws-sigv4' +require 'aws-sigv4' + +signer = Aws::Sigv4::Signer.new(service: 'es', + region: 'us-west-2', # signing service region + access_key_id: 'key_id', + secret_access_key: 'secret') + +client = OpenSearch::Aws::Sigv4Client.new({ + host: 'https://your.amz-managed-opensearch.domain', + log: true +}, signer) + +# create an index and document +index = 'prime' +client.indices.create(index: index) +client.index(index: index, id: '1', body: { name: 'Amazon Echo', + msrp: '5999', + year: 2011 }) + +# search for the document +client.search(body: { query: { match: { name: 'Echo' } } }) + +# delete the document +client.delete(index: index, id: '1') + +# delete the index +client.indices.delete(index: index) +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Serverless + +To connect to Amazon OpenSearch Serverless Service, first install the `opensearch-aws-sigv4` gem: + +```bash +gem install opensearch-aws-sigv4 +``` + +```ruby +require 'opensearch-aws-sigv4' +require 'aws-sigv4' + +signer = Aws::Sigv4::Signer.new(service: 'aoss', + region: 'us-west-2', # signing service region + access_key_id: 'key_id', + secret_access_key: 'secret') + +client = OpenSearch::Aws::Sigv4Client.new({ + host: 'https://your.amz-managed-opensearch.domain', # serverless endpoint for OpenSearch Serverless + log: true +}, signer) + +# create an index and document +index = 'prime' +client.indices.create(index: index) +client.index(index: index, id: '1', body: { name: 'Amazon Echo', + msrp: '5999', + year: 2011 }) + +# search for the document +client.search(body: { query: { match: { name: 'Echo' } } }) + +# delete the document +client.delete(index: index, id: '1') + +# delete the index +client.indices.delete(index: index) +``` +{% include copy.html %} 
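+
+The preceding examples hardcode an access key and secret for brevity. In practice, you would typically load credentials from the environment or another credential source rather than embedding them in code. The following minimal sketch assumes the standard AWS environment variables are set:
+
+```ruby
+require 'opensearch-aws-sigv4'
+require 'aws-sigv4'
+
+# Read credentials from the environment rather than hardcoding them.
+signer = Aws::Sigv4::Signer.new(service: 'es', # Use 'aoss' for OpenSearch Serverless.
+                                region: ENV['AWS_REGION'],
+                                access_key_id: ENV['AWS_ACCESS_KEY_ID'],
+                                secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
+                                session_token: ENV['AWS_SESSION_TOKEN'])
+
+client = OpenSearch::Aws::Sigv4Client.new({
+  host: 'https://your.amz-managed-opensearch.domain',
+  log: true
+}, signer)
+```
+{% include copy.html %}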
+ + +## Creating an index + +You don't need to create an index explicitly in OpenSearch. Once you upload a document into an index that does not exist, OpenSearch creates the index automatically. Alternatively, you can create an index explicitly to specify settings like the number of primary and replica shards. To create an index with non-default settings, create an index body hash with those settings: + +```ruby +index_body = { + 'settings': { + 'index': { + 'number_of_shards': 1, + 'number_of_replicas': 2 + } + } +} + +client.indices.create( + index: 'students', + body: index_body +) +``` +{% include copy.html %} + +## Mappings + +OpenSearch uses dynamic mapping to infer field types of the documents that are indexed. However, to have more control over the schema of your document, you can pass an explicit mapping to OpenSearch. You can define data types for some or all fields of your document in this mapping. To create a mapping for an index, use the `put_mapping` method: + +```ruby +client.indices.put_mapping( + index: 'students', + body: { + properties: { + first_name: { type: 'keyword' }, + last_name: { type: 'keyword' } + } + } +) +``` +{% include copy.html %} + +By default, string fields are mapped as `text`, but in the mapping above, the `first_name` and `last_name` fields are mapped as `keyword`. This mapping signals to OpenSearch that these fields should not be analyzed and should support only full case-sensitive matches. + +You can verify the index's mappings using the `get_mapping` method: + +```ruby +response = client.indices.get_mapping(index: 'students') +``` +{% include copy.html %} + +If you know the mapping of your documents in advance and want to avoid mapping errors (for example, misspellings of a field name), you can set the `dynamic` parameter to `strict`: + +```ruby +client.indices.put_mapping( + index: 'students', + body: { + dynamic: 'strict', + properties: { + first_name: { type: 'keyword' }, + last_name: { type: 'keyword' }, + gpa: { type: 'float'}, + grad_year: { type: 'integer'} + } + } +) +``` +{% include copy.html %} + +With strict mapping, you can index a document with a missing field, but you cannot index a document with a new field. 
For example, indexing the following document with a misspelled `grad_yea` field fails: + +```ruby +document = { + first_name: 'Connor', + last_name: 'James', + gpa: 3.93, + grad_yea: 2021 +} + +client.index( + index: 'students', + body: document, + id: 100, + refresh: true +) +``` +{% include copy.html %} + +OpenSearch returns a mapping error: + +```bash +{"error":{"root_cause":[{"type":"strict_dynamic_mapping_exception","reason":"mapping set to strict, dynamic introduction of [grad_yea] within [_doc] is not allowed"}],"type":"strict_dynamic_mapping_exception","reason":"mapping set to strict, dynamic introduction of [grad_yea] within [_doc] is not allowed"},"status":400} +``` + +## Indexing one document + +To index one document, use the `index` method: + +```ruby +document = { + first_name: 'Connor', + last_name: 'James', + gpa: 3.93, + grad_year: 2021 +} + +client.index( + index: 'students', + body: document, + id: 100, + refresh: true +) +``` +{% include copy.html %} + +## Updating a document + +To update a document, use the `update` method: + +```ruby +client.update(index: 'students', + id: 100, + body: { doc: { gpa: 3.25 } }, + refresh: true) +``` +{% include copy.html %} + +## Deleting a document + +To delete a document, use the `delete` method: + +```ruby +client.delete( + index: 'students', + id: 100, + refresh: true +) +``` +{% include copy.html %} + +## Bulk operations + +You can perform several operations at the same time by using the `bulk` method. The operations may be of the same type or of different types. + +You can index multiple documents using the `bulk` method: + +```ruby +actions = [ + { index: { _index: 'students', _id: '200' } }, + { first_name: 'James', last_name: 'Rodriguez', gpa: 3.91, grad_year: 2019 }, + { index: { _index: 'students', _id: '300' } }, + { first_name: 'Nikki', last_name: 'Wolf', gpa: 3.87, grad_year: 2020 } +] +client.bulk(body: actions, refresh: true) +``` +{% include copy.html %} + +You can delete multiple documents as follows: + +```ruby +# Deleting multiple documents. +actions = [ + { delete: { _index: 'students', _id: 200 } }, + { delete: { _index: 'students', _id: 300 } } +] +client.bulk(body: actions, refresh: true) +``` +{% include copy.html %} + +You can perform different operations when using `bulk` as follows: + +```ruby +actions = [ + { index: { _index: 'students', _id: 100, data: { first_name: 'Paulo', last_name: 'Santos', gpa: 3.29, grad_year: 2022 } } }, + { index: { _index: 'students', _id: 200, data: { first_name: 'Shirley', last_name: 'Rodriguez', gpa: 3.92, grad_year: 2020 } } }, + { index: { _index: 'students', _id: 300, data: { first_name: 'Akua', last_name: 'Mansa', gpa: 3.95, grad_year: 2022 } } }, + { index: { _index: 'students', _id: 400, data: { first_name: 'John', last_name: 'Stiles', gpa: 3.72, grad_year: 2019 } } }, + { index: { _index: 'students', _id: 500, data: { first_name: 'Li', last_name: 'Juan', gpa: 3.94, grad_year: 2022 } } }, + { index: { _index: 'students', _id: 600, data: { first_name: 'Richard', last_name: 'Roe', gpa: 3.04, grad_year: 2020 } } }, + { update: { _index: 'students', _id: 100, data: { doc: { gpa: 3.73 } } } }, + { delete: { _index: 'students', _id: 200 } } +] +client.bulk(body: actions, refresh: true) +``` +{% include copy.html %} + +In the above example, you pass the data and the header together and you denote the data with the `data:` key. + +## Searching for a document + +To search for a document, use the `search` method. 
The following example searches for a student whose first or last name is "James." It uses a `multi_match` query to search for two fields (`first_name` and `last_name`), and it is boosting the `last_name` field in relevance with a caret notation (`last_name^2`). + +```ruby +q = 'James' +query = { + 'size': 5, + 'query': { + 'multi_match': { + 'query': q, + 'fields': ['first_name', 'last_name^2'] + } + } +} + +response = client.search( + body: query, + index: 'students' +) +``` +{% include copy.html %} + +If you omit the request body in the `search` method, your query becomes a `match_all` query and returns all documents in the index: + +```ruby +client.search(index: 'students') +``` +{% include copy.html %} + +## Boolean query + +The Ruby client exposes full OpenSearch query capability. In addition to simple searches that use the match query, you can create a more complex Boolean query to search for students who graduated in 2022 and sort them by last name. In the example below, search is limited to 10 documents. + +```ruby +query = { + 'query': { + 'bool': { + 'filter': { + 'term': { + 'grad_year': 2022 + + } + } + } + }, + 'sort': { + 'last_name': { + 'order': 'asc' + } + } +} + +response = client.search(index: 'students', from: 0, size: 10, body: query) +``` +{% include copy.html %} + +## Multi-search + +You can bulk several queries together and perform a multi-search using the `msearch` method. The following code searches for students whose GPAs are outside the 3.1–3.9 range: + +```ruby +actions = [ + {}, + {query: {range: {gpa: {gt: 3.9}}}}, + {}, + {query: {range: {gpa: {lt: 3.1}}}} +] +response = client.msearch(index: 'students', body: actions) +``` +{% include copy.html %} + +## Scroll + +You can paginate your search results using the Scroll API: + +```ruby +response = client.search(index: index_name, scroll: '2m', size: 2) + +while response['hits']['hits'].size.positive? + scroll_id = response['_scroll_id'] + puts(response['hits']['hits'].map { |doc| [doc['_source']['first_name'] + ' ' + doc['_source']['last_name']] }) + response = client.scroll(scroll: '1m', body: { scroll_id: scroll_id }) +end +``` +{% include copy.html %} + +First, you issue a search query, specifying the `scroll` and `size` parameters. The `scroll` parameter tells OpenSearch how long to keep the search context. In this case, it is set to two minutes. The `size` parameter specifies how many documents you want to return in each request. + +The response to the initial search query contains a `_scroll_id` that you can use to get the next set of documents. To do this, you use the `scroll` method, again specifying the `scroll` parameter and passing the `_scroll_id` in the body. You don't need to specify the query or index to the `scroll` method. The `scroll` method returns the next set of documents and the `_scroll_id`. It's important to use the latest `_scroll_id` when requesting the next batch of documents because `_scroll_id` can change between requests. + +## Deleting an index + +You can delete the index using the `delete` method: + +```ruby +response = client.indices.delete(index: index_name) +``` +{% include copy.html %} + +## Sample program + +The following is a complete sample program that illustrates all of the concepts described in the preceding sections. The Ruby client's methods return responses as Ruby hashes, which are hard to read. To display JSON responses in a pretty format, the sample program uses the `MultiJson.dump` method. 
+ +```ruby +require 'opensearch' + +client = OpenSearch::Client.new(host: 'http://localhost:9200') + +# Create an index with non-default settings +index_name = 'students' +index_body = { + 'settings': { + 'index': { + 'number_of_shards': 1, + 'number_of_replicas': 2 + } + } + } + +client.indices.create( + index: index_name, + body: index_body +) + +# Create a mapping +client.indices.put_mapping( + index: index_name, + body: { + properties: { + first_name: { type: 'keyword' }, + last_name: { type: 'keyword' } + } + } +) + +# Get mappings +response = client.indices.get_mapping(index: index_name) +puts 'Mappings for the students index:' +puts MultiJson.dump(response, pretty: "true") + +# Add one document to the index +puts 'Adding one document:' +document = { + first_name: 'Connor', + last_name: 'James', + gpa: 3.93, + grad_year: 2021 +} +id = 100 + +client.index( + index: index_name, + body: document, + id: id, + refresh: true +) + +response = client.search(index: index_name) +puts MultiJson.dump(response, pretty: "true") + +# Update a document +puts 'Updating a document:' +client.update(index: index_name, id: id, body: { doc: { gpa: 3.25 } }, refresh: true) +response = client.search(index: index_name) +puts MultiJson.dump(response, pretty: "true") +print 'The updated gpa is ' +puts response['hits']['hits'].map { |doc| doc['_source']['gpa'] } + +# Add many documents in bulk +documents = [ +{ index: { _index: index_name, _id: '200' } }, +{ first_name: 'James', last_name: 'Rodriguez', gpa: 3.91, grad_year: 2019}, +{ index: { _index: index_name, _id: '300' } }, +{ first_name: 'Nikki', last_name: 'Wolf', gpa: 3.87, grad_year: 2020} +] +client.bulk(body: documents, refresh: true) + +# Get all documents in the index +response = client.search(index: index_name) +puts 'All documents in the index after bulk upload:' +puts MultiJson.dump(response, pretty: "true") + +# Search for a document using a multi_match query +puts 'Searching for documents that match "James":' +q = 'James' +query = { + 'size': 5, + 'query': { + 'multi_match': { + 'query': q, + 'fields': ['first_name', 'last_name^2'] + } + } +} + +response = client.search( + body: query, + index: index_name +) +puts MultiJson.dump(response, pretty: "true") + +# Delete the document +response = client.delete( +index: index_name, +id: id, +refresh: true +) + +response = client.search(index: index_name) +puts 'Documents in the index after one document was deleted:' +puts MultiJson.dump(response, pretty: "true") + +# Delete multiple documents +actions = [ + { delete: { _index: index_name, _id: 200 } }, + { delete: { _index: index_name, _id: 300 } } +] +client.bulk(body: actions, refresh: true) + +response = client.search(index: index_name) + +puts 'Documents in the index after all documents were deleted:' +puts MultiJson.dump(response, pretty: "true") + +# Bulk several operations together +actions = [ + { index: { _index: index_name, _id: 100, data: { first_name: 'Paulo', last_name: 'Santos', gpa: 3.29, grad_year: 2022 } } }, + { index: { _index: index_name, _id: 200, data: { first_name: 'Shirley', last_name: 'Rodriguez', gpa: 3.92, grad_year: 2020 } } }, + { index: { _index: index_name, _id: 300, data: { first_name: 'Akua', last_name: 'Mansa', gpa: 3.95, grad_year: 2022 } } }, + { index: { _index: index_name, _id: 400, data: { first_name: 'John', last_name: 'Stiles', gpa: 3.72, grad_year: 2019 } } }, + { index: { _index: index_name, _id: 500, data: { first_name: 'Li', last_name: 'Juan', gpa: 3.94, grad_year: 2022 } } }, + { index: { _index: 
index_name, _id: 600, data: { first_name: 'Richard', last_name: 'Roe', gpa: 3.04, grad_year: 2020 } } }, + { update: { _index: index_name, _id: 100, data: { doc: { gpa: 3.73 } } } }, + { delete: { _index: index_name, _id: 200 } } +] +client.bulk(body: actions, refresh: true) + +puts 'All documents in the index after bulk operations with scrolling:' +response = client.search(index: index_name, scroll: '2m', size: 2) + +while response['hits']['hits'].size.positive? + scroll_id = response['_scroll_id'] + puts(response['hits']['hits'].map { |doc| [doc['_source']['first_name'] + ' ' + doc['_source']['last_name']] }) + response = client.scroll(scroll: '1m', body: { scroll_id: scroll_id }) +end + +# Multi search +actions = [ + {}, + {query: {range: {gpa: {gt: 3.9}}}}, + {}, + {query: {range: {gpa: {lt: 3.1}}}} +] +response = client.msearch(index: index_name, body: actions) + +puts 'Multi search results:' +puts MultiJson.dump(response, pretty: "true") + +# Boolean query +query = { + 'query': { + 'bool': { + 'filter': { + 'term': { + 'grad_year': 2022 + + } + } + } + }, + 'sort': { + 'last_name': { + 'order': 'asc' + } + } +} + +response = client.search(index: index_name, from: 0, size: 10, body: query) + +puts 'Boolean query search results:' +puts MultiJson.dump(response, pretty: "true") + +# Delete the index +puts 'Deleting the index:' +response = client.indices.delete(index: index_name) + +puts MultiJson.dump(response, pretty: "true") +``` +{% include copy.html %} + +# Ruby AWS Sigv4 Client + +The [opensearch-aws-sigv4](https://github.com/opensearch-project/opensearch-ruby-aws-sigv4) gem provides the `OpenSearch::Aws::Sigv4Client` class, which has all features of `OpenSearch::Client`. The only difference between these two clients is that `OpenSearch::Aws::Sigv4Client` requires an instance of `Aws::Sigv4::Signer` during instantiation to authenticate with AWS: + +```ruby +require 'opensearch-aws-sigv4' +require 'aws-sigv4' + +signer = Aws::Sigv4::Signer.new(service: 'es', + region: 'us-west-2', + access_key_id: 'key_id', + secret_access_key: 'secret') + +client = OpenSearch::Aws::Sigv4Client.new({ log: true }, signer) + +client.cluster.health + +client.transport.reload_connections! + +client.search q: 'test' +``` +{% include copy.html %} \ No newline at end of file diff --git a/_clients/rust.md b/_clients/rust.md new file mode 100644 index 00000000..36d1d24d --- /dev/null +++ b/_clients/rust.md @@ -0,0 +1,416 @@ +--- +layout: default +title: Rust client +nav_order: 100 +--- + +# Rust client + +The OpenSearch Rust client lets you connect your Rust application with the data in your OpenSearch cluster. For the client's complete API documentation and additional examples, see the [OpenSearch docs.rs documentation](https://docs.rs/opensearch/). + +This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [opensearch-rs repo](https://github.com/opensearch-project/opensearch-rs). 
+ +## Setup + +If you're starting a new project, add the `opensearch` crate to Cargo.toml: + +```rust +[dependencies] +opensearch = "1.0.0" +``` +{% include copy.html %} + +Additionally, you may want to add the following `serde` dependencies that help serialize types to JSON and deserialize JSON responses: + +```rust +serde = "~1" +serde_json = "~1" +``` +{% include copy.html %} + +The Rust client uses the higher-level [`reqwest`](https://crates.io/crates/reqwest) HTTP client library for HTTP requests, and reqwest uses the [`tokio`](https://crates.io/crates/tokio) platform to support asynchronous requests. If you are planning to use asynchronous functions, you need to add the `tokio` dependency to Cargo.toml: + +```rust +tokio = { version = "*", features = ["full"] } +``` +{% include copy.html %} + +See the [Sample program](#sample-program) section for the complete Cargo.toml file. + +To use the Rust client API, import the modules, structs, and enums you need: + +```rust +use opensearch::OpenSearch; +``` +{% include copy.html %} + +## Connecting to OpenSearch + +To connect to the default OpenSearch host, create a default client object that connects to OpenSearch at the address `http://localhost:9200`: + +```rust +let client = OpenSearch::default(); +``` +{% include copy.html %} + +To connect to an OpenSearch host that is running at a different address, create a client with the specified address: + +```rust +let transport = Transport::single_node("http://localhost:9200")?; +let client = OpenSearch::new(transport); +``` +{% include copy.html %} + +Alternatively, you can customize the URL and use a connection pool by creating a `TransportBuilder` struct and passing it to `OpenSearch::new` to create a new instance of the client: + +```rust +let url = Url::parse("http://localhost:9200")?; +let conn_pool = SingleNodeConnectionPool::new(url); +let transport = TransportBuilder::new(conn_pool).disable_proxy().build()?; +let client = OpenSearch::new(transport); +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Service + +The following example illustrates connecting to Amazon OpenSearch Service: + +```rust +let url = Url::parse("https://..."); +let service_name = "es"; +let conn_pool = SingleNodeConnectionPool::new(url?); +let region_provider = RegionProviderChain::default_provider().or_else("us-east-1"); +let aws_config = aws_config::from_env().region(region_provider).load().await.clone(); +let transport = TransportBuilder::new(conn_pool) + .auth(aws_config.clone().try_into()?) + .service_name(service_name) + .build()?; +let client = OpenSearch::new(transport); +``` +{% include copy.html %} + +## Connecting to Amazon OpenSearch Serverless + +The following example illustrates connecting to Amazon OpenSearch Serverless Service: + +```rust +let url = Url::parse("https://..."); +let service_name = "aoss"; +let conn_pool = SingleNodeConnectionPool::new(url?); +let region_provider = RegionProviderChain::default_provider().or_else("us-east-1"); +let aws_config = aws_config::from_env().region(region_provider).load().await.clone(); +let transport = TransportBuilder::new(conn_pool) + .auth(aws_config.clone().try_into()?) + .service_name(service_name) + .build()?; +let client = OpenSearch::new(transport); +``` +{% include copy.html %} + + +## Creating an index + +To create an OpenSearch index, use the `create` function of the `opensearch::indices::Indices` struct. 
You can use the following code to construct a JSON object with custom mappings: + +```rust +let response = client + .indices() + .create(IndicesCreateParts::Index("movies")) + .body(json!({ + "mappings" : { + "properties" : { + "title" : { "type" : "text" } + } + } + })) + .send() + .await?; +``` +{% include copy.html %} + +## Indexing a document + +You can index a document into OpenSearch using the client's `index` function: + +```rust +let response = client + .index(IndexParts::IndexId("movies", "1")) + .body(json!({ + "id": 1, + "title": "Moneyball", + "director": "Bennett Miller", + "year": "2011" + })) + .send() + .await?; +``` +{% include copy.html %} + +## Performing bulk operations + +You can perform several operations at the same time by using the client's `bulk` function. First, create the JSON body of a Bulk API call, and then pass it to the `bulk` function: + +```rust +let mut body: Vec<JsonBody<_>> = Vec::with_capacity(4); + +// add the first operation and document +body.push(json!({"index": {"_id": "2"}}).into()); +body.push(json!({ + "id": 2, + "title": "Interstellar", + "director": "Christopher Nolan", + "year": "2014" +}).into()); + +// add the second operation and document +body.push(json!({"index": {"_id": "3"}}).into()); +body.push(json!({ + "id": 3, + "title": "Star Trek Beyond", + "director": "Justin Lin", + "year": "2015" +}).into()); + +let response = client + .bulk(BulkParts::Index("movies")) + .body(body) + .send() + .await?; +``` +{% include copy.html %} + +## Searching for documents + +The easiest way to search for documents is to construct a query string. The following code uses a `multi_match` query to search for "miller" in the title and director fields. It boosts the documents where "miller" appears in the title field: + +```rust +response = client + .search(SearchParts::Index(&["movies"])) + .from(0) + .size(10) + .body(json!({ + "query": { + "multi_match": { + "query": "miller", + "fields": ["title^2", "director"] + } + } + })) + .send() + .await?; +``` +{% include copy.html %} + +You can then read the response body as JSON and iterate over the `hits` array to read all the `_source` documents: + +```rust +let response_body = response.json::<Value>().await?; +for hit in response_body["hits"]["hits"].as_array().unwrap() { + // print the source document + println!("{}", serde_json::to_string_pretty(&hit["_source"]).unwrap()); +} +``` +{% include copy.html %} + +## Deleting a document + +You can delete a document using the client's `delete` function: + +```rust +let response = client + .delete(DeleteParts::IndexId("movies", "2")) + .send() + .await?; +``` +{% include copy.html %} + +## Deleting an index + +You can delete an index using the `delete` function of the `opensearch::indices::Indices` struct: + +```rust +let response = client + .indices() + .delete(IndicesDeleteParts::Index(&["movies"])) + .send() + .await?; +``` +{% include copy.html %} + +## Sample program + +The sample program uses the following Cargo.toml file with all dependencies described in the [Setup](#setup) section: + +```rust +[package] +name = "os_rust_project" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +opensearch = "1.0.0" +tokio = { version = "*", features = ["full"] } +serde = "~1" +serde_json = "~1" +``` +{% include copy.html %} + +The following sample program creates a client, adds an index with non-default mappings, inserts a document, performs bulk operations, searches for the document,
deletes the document, and then deletes the index: + +```rust +use opensearch::{DeleteParts, OpenSearch, IndexParts, http::request::JsonBody, BulkParts, SearchParts}; +use opensearch::{indices::{IndicesDeleteParts, IndicesCreateParts}}; +use serde_json::{json, Value}; + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + let client = OpenSearch::default(); + + // Create an index + let mut response = client + .indices() + .create(IndicesCreateParts::Index("movies")) + .body(json!({ + "mappings" : { + "properties" : { + "title" : { "type" : "text" } + } + } + })) + .send() + .await?; + + let mut successful = response.status_code().is_success(); + + if successful { + println!("Successfully created an index"); + } + else { + println!("Could not create an index"); + } + + // Index a single document + println!("Indexing a single document..."); + response = client + .index(IndexParts::IndexId("movies", "1")) + .body(json!({ + "id": 1, + "title": "Moneyball", + "director": "Bennett Miller", + "year": "2011" + })) + .send() + .await?; + + successful = response.status_code().is_success(); + + if successful { + println!("Successfully indexed a document"); + } + else { + println!("Could not index document"); + } + + // Index multiple documents using the bulk operation + + println!("Indexing multiple documents..."); + + let mut body: Vec<JsonBody<_>> = Vec::with_capacity(4); + + // add the first operation and document + body.push(json!({"index": {"_id": "2"}}).into()); + body.push(json!({ + "id": 2, + "title": "Interstellar", + "director": "Christopher Nolan", + "year": "2014" + }).into()); + + // add the second operation and document + body.push(json!({"index": {"_id": "3"}}).into()); + body.push(json!({ + "id": 3, + "title": "Star Trek Beyond", + "director": "Justin Lin", + "year": "2015" + }).into()); + + response = client + .bulk(BulkParts::Index("movies")) + .body(body) + .send() + .await?; + + let mut response_body = response.json::<Value>().await?; + successful = response_body["errors"].as_bool().unwrap() == false; + + if successful { + println!("Successfully performed bulk operations"); + } + else { + println!("Could not perform bulk operations"); + } + + // Search for a document + + println!("Searching for a document..."); + response = client + .search(SearchParts::Index(&["movies"])) + .from(0) + .size(10) + .body(json!({ + "query": { + "multi_match": { + "query": "miller", + "fields": ["title^2", "director"] + } + } + })) + .send() + .await?; + + response_body = response.json::<Value>().await?; + for hit in response_body["hits"]["hits"].as_array().unwrap() { + // print the source document + println!("{}", serde_json::to_string_pretty(&hit["_source"]).unwrap()); + } + + // Delete a document + + response = client + .delete(DeleteParts::IndexId("movies", "2")) + .send() + .await?; + + successful = response.status_code().is_success(); + + if successful { + println!("Successfully deleted a document"); + } + else { + println!("Could not delete document"); + } + + // Delete the index + + response = client + .indices() + .delete(IndicesDeleteParts::Index(&["movies"])) + .send() + .await?; + + successful = response.status_code().is_success(); + + if successful { + println!("Successfully deleted the index"); + } + else { + println!("Could not delete the index"); + } + + Ok(()) +} +``` +{% include copy.html %} \ No newline at end of file diff --git a/_config.yml b/_config.yml index f0bc0c42..bf14b0e2 100644 --- a/_config.yml +++ b/_config.yml @@ -1,4 +1,4 @@ -title: OpenSearch documentation +title: OpenSearch Documentation
description: >- # this means to ignore newlines until "baseurl:" Documentation for OpenSearch, the Apache 2.0 search, analytics, and visualization suite with advanced security, alerting, SQL support, automated index management, deep performance analysis, and more. # baseurl: "/docs/latest" # the subpath of your site, e.g. /blog @@ -6,9 +6,10 @@ baseurl: "" url: "https://opensearch.ossez.com" # the base hostname & protocol for your site, e.g. http://example.com permalink: /:path/ -opensearch_version: 1.1.0 -opensearch_major_minor_version: 1.1 -lucene_version: 8_9_0 +opensearch_version: '2.12.0' +opensearch_dashboards_version: '2.12.0' +opensearch_major_minor_version: '2.12' +lucene_version: '9_9_2' # Build settings markdown: kramdown @@ -28,82 +29,231 @@ color_scheme: opensearch # Define Jekyll collections collections: # Define a collection named "tests", its documents reside in the "_tests" directory + install-and-configure: + permalink: /:collection/:path/ + output: true upgrade-to: permalink: /:collection/:path/ output: true - opensearch: - permalink: /:collection/:path/ - output: true - dashboards: - permalink: /:collection/:path/ - output: true - security-plugin: - permalink: /:collection/:path/ - output: true - search-plugins: - permalink: /:collection/:path/ - output: true im-plugin: permalink: /:collection/:path/ output: true - replication-plugin: + ingest-pipelines: permalink: /:collection/:path/ output: true - monitoring-plugins: + dashboards: + permalink: /:collection/:path/ + output: true + integrations: + permalink: /:collection/:path/ + output: true + tuning-your-cluster: + permalink: /:collection/:path/ + output: true + security: + permalink: /:collection/:path/ + output: true + security-analytics: + permalink: /:collection/:path/ + output: true + search-plugins: + permalink: /:collection/:path/ + output: true + ml-commons-plugin: + permalink: /:collection/:path/ + output: true + tuning-your-cluster: + permalink: /:collection/:path/ + output: true + monitoring-your-cluster: + permalink: /:collection/:path/ + output: true + observing-your-data: + permalink: /:collection/:path/ + output: true + reporting: + permalink: /:collection/:path/ + output: true + analyzers: + permalink: /:collection/:path/ + output: true + query-dsl: + permalink: /:collection/:path/ + output: true + aggregations: + permalink: /:collection/:path/ + output: true + field-types: permalink: /:collection/:path/ output: true clients: permalink: /:collection/:path/ output: true + benchmark: + permalink: /:collection/:path/ + output: true + data-prepper: + permalink: /:collection/:path/ + output: true + tools: + permalink: /:collection/:path/ + output: true + api-reference: + permalink: /:collection/:path/ + output: true troubleshoot: permalink: /:collection/:path/ output: true external_links: permalink: /:collection/:path/ output: true + developer-documentation: + permalink: /:collection/:path/ + output: true + about: + permalink: /:collection/:path/ + output: true + automating-configurations: + permalink: /:collection/:path/ + output: true + dashboards-assistant: + permalink: /:collection/:path/ + output: true -just_the_docs: +opensearch_collection: # Define the collections used in the theme collections: + about: + name: About OpenSearch + nav_fold: true + install-and-configure: + name: Install and upgrade + nav_fold: true upgrade-to: - name: Upgrade to OpenSearch + name: Migrate to OpenSearch # nav_exclude: true nav_fold: true # search_exclude: true - opensearch: - name: OpenSearch + im-plugin: + name: 
Managing Indexes + nav_fold: true + ingest-pipelines: + name: Ingest Pipelines nav_fold: true dashboards: name: OpenSearch Dashboards nav_fold: true - security-plugin: - name: Security plugin + integrations: + name: OpenSearch Integrations + nav_fold: true + tuning-your-cluster: + name: Creating and tuning your cluster + nav_fold: true + security: + name: Security in OpenSearch + nav_fold: true + security-analytics: + name: Security analytics + nav_fold: true + field-types: + name: Mappings and field types + nav_fold: true + analyzers: + name: Text analysis + nav_fold: true + query-dsl: + name: Query DSL + nav_fold: true + aggregations: + name: Aggregations nav_fold: true search-plugins: - name: Search plugins + name: Search nav_fold: true - im-plugin: - name: Index management plugin + ml-commons-plugin: + name: Machine learning nav_fold: true - replication-plugin: - name: Replication plugin + automating-configurations: + name: Automating configurations nav_fold: true - monitoring-plugins: - name: Monitoring plugins + monitoring-your-cluster: + name: Monitoring your cluster nav_fold: true - clients: - name: Clients and tools + observing-your-data: + name: Observability + nav_fold: true + reporting: + name: Reporting + nav_fold: true + tools: + name: Tools + nav_fold: true + api-reference: + name: API reference nav_fold: true troubleshoot: name: Troubleshooting nav_fold: true - external_links: - name: External links + developer-documentation: + name: Developer documentation + nav_fold: true +clients_collection: + collections: + clients: + name: Clients + nav_fold: true + +benchmark_collection: + collections: + benchmark: + name: OpenSearch Benchmark + nav_fold: true + +data_prepper_collection: + collections: + data-prepper: + name: Data Prepper + nav_fold: true + +# Defaults + +defaults: + - + scope: + path: "" # an empty string here means all files in the project + values: + section: "opensearch" + section-name: "OpenSearch and OpenSearch Dashboards" + - + scope: + path: "_data-prepper" + values: + section: "data-prepper" + section-name: "Data Prepper" + - + scope: + path: "_clients" + values: + section: "clients" + section-name: "Clients" + - + scope: + path: "_benchmark" + values: + section: "benchmark" + section-name: "Benchmark" # Enable or disable the site search -# Supports true (default) or false -search_enabled: true +# By default, just-the-docs enables its JSON file-based search. We also have an OpenSearch-driven search functionality. +# To disable any search from appearing, both `search_enabled` and `use_custom_search` need to be false. +# To use the OpenSearch-driven search, `search_enabled` has to be false and `use_custom_search` needs to be true. +# If `search_enabled` is true, irrespective of the value of `use_custom_search`, the JSON file-based search appears. 
+# +# `search_enabled` defaults to true +# `use_custom_search` defaults to false +search_enabled: false +use_custom_search: true search: # Split pages into sections that can be searched individually @@ -140,13 +290,23 @@ heading_anchors: false # Adds on-hover anchor links to h2-h6 anchor_links: true +# This setting governs including warning on every page +# 'unsupported' produces red warning, 'supported' produces yellow warning +# everything else produces no warning +doc_version: latest + footer_content: plugins: + - jekyll-last-modified-at - jekyll-remote-theme - jekyll-redirect-from - jekyll-sitemap +# This format has to conform to RFC822 +last-modified-at: + date-format: '%a, %d %b %Y %H:%M:%S %z' + # Exclude from processing. # The following items will not be processed, by default. Create a custom list # to override the default setting. @@ -159,3 +319,5 @@ exclude: - vendor/gems/ - vendor/ruby/ - README.md + - .idea + - templates diff --git a/_dashboards/branding.md b/_dashboards/branding.md new file mode 100644 index 00000000..f4e4c1f4 --- /dev/null +++ b/_dashboards/branding.md @@ -0,0 +1,181 @@ +--- +layout: default +title: Custom branding +nav_order: 130 +--- + +# Custom branding +Introduced 1.2 +{: .label .label-purple } + +By default, OpenSearch Dashboards uses the OpenSearch logo, but if you want to use custom branding elements such as the favicon or main Dashboards logo, you can do so by editing `opensearch_dashboards.yml` or by including a custom `opensearch_dashboards.yml` file when you start your OpenSearch cluster. + +For example, if you're using Docker to start your OpenSearch cluster, include the following lines in the `opensearch-dashboards` section of your `docker-compose.yml` file: + +``` +volumes: + - ./opensearch_dashboards.yml:/usr/share/opensearch-dashboards/config/opensearch_dashboards.yml +``` + +Doing so replaces the Docker image's default `opensearch_dashboards.yml` with your custom `opensearch_dashboards.yml` file, so be sure to include your desired settings as well. For example, if you want to configure TLS for OpenSearch Dashboards, see [Configure TLS for OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/install/tls). + +Re-launch OpenSearch Dashboards, and OpenSearch Dashboards now uses your custom elements. + +## Branding elements + +The following elements in OpenSearch Dashboards are customizable: + +![OpenSearch customizable branding elements]({{site.url}}{{site.baseurl}}/images/dashboards-branding-labels.png) + +Setting | Corresponding branding element +:--- | :--- +logo | Header logo. See #1 in the image. +mark | OpenSearch Dashboards mark. See #2 in the image. +loadingLogo | Loading logo used when OpenSearch Dashboards is starting. See #3 in the image. +faviconUrl | Website icon. Loads next to the application title. See #4 in the image. +applicationTitle | The application's title. See #5 in the image. + +To consolidate navigation controls and reduce the space the header takes up on the page, see [Condensed header](#condensed-header). +{: .note} + +To start using your own branding elements in OpenSearch Dashboards, first uncomment this section of `opensearch_dashboards.yml`: + +```yml +# opensearchDashboards.branding: + # logo: + # defaultUrl: "" + # darkModeUrl: "" + # mark: + # defaultUrl: "" + # darkModeUrl: "" + # loadingLogo: + # defaultUrl: "" + # darkModeUrl: "" + # faviconUrl: "" + # applicationTitle: "" +``` + +Add the URLs you want to use as branding elements to the appropriate setting. 
Valid image types are `SVG`, `PNG`, and `GIF`. + +Customization of dark mode Dashboards is also available, but you first must supply a valid link to `defaultUrl`, and then link to your preferred image with `darkModeUrl`. If you don't provide a `darkModeUrl` link, then Dashboards uses the provided `defaultUrl` element for dark mode. You are not required to customize all branding elements, so if you wanted to, it's perfectly valid to change just the logo or any other element. Leave unchanged elements as commented. + +The following example demonstrates how to use `SVG` files as logos but leaves the other elements as defaults. + +```yml +logo: + defaultUrl: "https://example.com/validUrl.svg" + darkModeUrl: "https://example.com/validDarkModeUrl.svg" +# mark: +# defaultUrl: "" +# darkModeUrl: "" +# loadingLogo: +# defaultUrl: "" +# darkModeUrl: "" +# faviconUrl: "" +applicationTitle: "My custom application" +``` + +We recommend linking to images that are hosted on a web server, but if you really want to use locally hosted images, save your images inside `assets`, and then configure `opensearch_dashboards.yml` to use the correct paths. You can access locally stored images through the `ui/assets` folder. + +The following example assumes the default port of 5601 that Dashboards uses and demonstrates how to link to locally stored images. + +```yml +logo: + defaultUrl: "https://localhost:5601/ui/assets/my-own-image.svg" + darkModeUrl: "https://localhost:5601/ui/assets/dark-mode-my-own-image.svg" +mark: + defaultUrl: "https://localhost:5601/ui/assets/my-own-image2.svg" + darkModeUrl: "https://localhost:5601/ui/assets/dark-mode-my-own-image2.svg" +# loadingLogo: +# defaultUrl: "" +# darkModeUrl: "" +# faviconUrl: "" +applicationTitle: "My custom application" +``` + +### Condensed header + +The condensed header view reduces the footprint of the header and frees up space on the page by combining navigational elements into a single header bar. + +The current default view remains close in appearance to the two-bar header offered in the previous version of Dashboards, with minor differences. To specify the condensed header, add the configuration property `useExpandedHeader` to the `opensearch_dashboards.yml` file and set the value to `false`, as the following example illustrates. + + ```yml +# opensearchDashboards.branding: + # logo: + defaultUrl: "https://example.com/sample.svg" + darkModeUrl: "https://example.com/dark-mode-sample.svg" + # mark: + # defaultUrl: "" + # darkModeUrl: "" + # loadingLogo: + # defaultUrl: "" + # darkModeUrl: "" + # faviconUrl: "" + applicationTitle: "my custom application" + useExpandedHeader: false +``` + +In a future release, default behavior will become `useExpandedHeader: false`. If you want to retain the default view in subsequent releases, you can explicitly set the property to `true` in advance. Alternatively, you can also do this when upgrading. +{: .note } + +The condensed view header appears as in the example below. + +![Condensed header]({{site.url}}{{site.baseurl}}/images/DBs-Condensed.jpeg) + +Header element | Description +:--- | :--- +OpenSearch logo | See #1. Functions as the home button. +Header bar | See #2. A single header bar used for all navigation controls. + +The default view remains close to the traditional view, with minor changes. + +![Default header]({{site.url}}{{site.baseurl}}/images/DBs-Traditional.jpeg) + +Header element | Description +:--- | :--- +Home button | See #1. Returns to the home page and provides an indication when a page is loading. 
+Header label | See #2. The label also functions as a home button. +Navigation controls | See #3. Additional navigation controls on right-side insertion points. + +#### Preserving navigation elements in the default view + +You can continue using the top header bar in the default view for custom navigation links (such as menu items and plugins). Follow the steps below to keep these elements in the top header in the default view. +1. Replace the property `coreStart.chrome.navControls.registerRight(...)` with `coreStart.chrome.navControls.registerExpandedRight(...)` and then replace the property `coreStart.chrome.navControls.registerCenter(...)` with `coreStart.chrome.navControls.registerExpandedCenter(...)`. + +2. Make sure the configuration property `useExpandedHeader` is explicitly set to `true`. + + +## Sample configuration + +The following configuration enables the Security plugin and SSL within OpenSearch Dashboards and uses custom branding elements to replace the OpenSearch logo and application title. + +```yml +server.host: "0" +opensearch.hosts: ["https://localhost:9200"] +opensearch.ssl.verificationMode: none +opensearch.username: "kibanaserver" +opensearch.password: "kibanaserver" +opensearch.requestHeadersAllowlist: [ authorization,securitytenant ] +#server.ssl.enabled: true +#server.ssl.certificate: /path/to/your/server/certificate +#server.ssl.key: /path/to/your/server/key + +opensearch_security.multitenancy.enabled: true +opensearch_security.multitenancy.tenants.preferred: ["Private", "Global"] +opensearch_security.readonly_mode.roles: ["kibana_read_only"] +# Use this setting if you are running opensearch-dashboards without https +opensearch_security.cookie.secure: false + +opensearchDashboards.branding: + logo: + defaultUrl: "https://example.com/sample.svg" + darkModeUrl: "https://example.com/dark-mode-sample.svg" + # mark: + # defaultUrl: "" + # darkModeUrl: "" + # loadingLogo: + # defaultUrl: "" + # darkModeUrl: "" + # faviconUrl: "" + applicationTitle: "Just some testing" +``` diff --git a/_dashboards/browser-compatibility.md b/_dashboards/browser-compatibility.md deleted file mode 100644 index 9848db46..00000000 --- a/_dashboards/browser-compatibility.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -layout: default -title: Browser compatibility -parent: OpenSearch Dashboards -nav_order: 3 ---- - -# Browser compatibility - -OpenSearch Dashboards supports the following web browsers: - -- Chrome -- Firefox -- Safari -- Edge (Chromium) - -Other Chromium-based browsers might work, as well. Internet Explorer and Microsoft Edge Legacy are **not** supported. diff --git a/_dashboards/dashboard/index.md b/_dashboards/dashboard/index.md new file mode 100644 index 00000000..37878bd2 --- /dev/null +++ b/_dashboards/dashboard/index.md @@ -0,0 +1,142 @@ +--- +layout: default +title: Creating dashboards +nav_order: 30 +has_children: false +--- + +# Creating dashboards + +The **Dashboard** application in OpenSearch Dashboards lets you visually represent your analytical, operational, and strategic data to help you quickly understand the trends in your data, giving you a high-level view of key metrics, simplifying data exploration, and delivering insights when and where you need them. + +In this tutorial, you'll learn the basics of creating a dashboard using the **Dashboard** application and OpenSearch sample data. The sample dataset has existing sample visualizations, and you can use those visualizations or create new visualizations for the dashboard. In this tutorial, you'll do both.
Once you've completed this tutorial, you'll have learned the foundations of creating a new dashboard with multiple panels in OpenSearch Dashboards. + +This OpenSearch Playground [dashboard example](https://playground.opensearch.org/app/dashboards#/view/722b74f0-b882-11e8-a6d9-e546fe2bba5f?_g=(filters:!(),refreshInterval:(pause:!f,value:900000),time:(from:now-7d,to:now))&_a=(description:'Analyze%20mock%20eCommerce%20orders%20and%20revenue',filters:!(),fullScreenMode:!f,options:(hidePanelTitles:!f,useMargins:!t),query:(language:kuery,query:''),timeRestore:!t,title:'%5BeCommerce%5D%20Revenue%20Dashboard',viewMode:view)) shows you what's possible with OpenSearch Dashboards. +{: .note} + +## Getting familiar with the UI + +Before getting started, let's get familiar with the **Dashboard** UI. The UI comprises the following main components: + +![Dashboard user interface]({{site.url}}{{site.baseurl}}/images/dashboards/dashboard-UI.png) + +- The **navigation panel** (A) on the left contains the OpenSearch Dashboards applications. +- The **search** bar (B) lets you search for documents and other objects and add filters. +- The **filter** (C) lets you narrow a dashboard's results. +- The **toolbar** (D) contains frequently used commands and shortcuts. +- The **time filter** (E) lets you customize the time and date. +- The **panel** (F) allows you to add existing visualizations to the dashboard or create new ones for the dashboard. + +## Defining terminology + +The following is some useful terminology for working with OpenSearch Dashboards and the **Dashboard** application: + +- _Dashboards_ is the abbreviated name for OpenSearch Dashboards. OpenSearch Dashboards is an open-source visualization tool designed to work with OpenSearch. +- _Dashboard_ is the OpenSearch Dashboards application used to track, analyze, and display data. +- _dashboard_ or _dashboards_ are common names for a tool used to visually display data. +- _Panel_ is a term used to refer to a visualization displayed on a dashboard. The terms _panel_ and _visualization_ may be used interchangeably throughout this and other Dashboards documentation. + +The following tutorial assumes you're either using your existing installation of OpenSearch Dashboards or using the [OpenSearch Playground](https://playground.opensearch.org/app/home#/). Depending on which one you use, certain capabilities may not be available. For example, sample datasets may not be included in your existing installation, and saving a dashboard isn't an option in the OpenSearch Playground. +{: .note} + +## Creating a dashboard and adding an existing visualization + +To create a dashboard and add a sample visualization: + +1. Connect to `https://localhost:5601`. The username and password are `admin`. Alternatively, go to the [OpenSearch Playground](https://playground.opensearch.org/app/home#/). +1. On the top menu, go to **OpenSearch Dashboards > Dashboard**. +1. From the **Dashboards** panel, choose **Create Dashboard**. +1. Choose the calendar icon and set the time filter to **Last 30 days**. +1. From the panel, choose **Add an existing**. +1. From the **Add panels** window, choose **[eCommerce] Promotion Tracking**, and then choose `x` to close the panel. + +You've now created the following basic dashboard with a single panel, which you'll continue using throughout this tutorial. 
+ +![Basic dashboard with single panel]({{site.url}}{{site.baseurl}}/images/dashboards/dashboard-basic.png) + +## Creating visualizations + +Continuing with the dashboard you created in the preceding steps, you'll create a new visualization and save it to the dashboard: + +1. From the dashboard toolbar, choose **Create new**. +1. From the **New Visualization** window, choose **Gauge** and then select the index pattern **opensearch_dashboards_sample_data_ecommerce**. +1. From the toolbar, choose **Save**. +1. In the **Save visualization** window, enter a title for the visualization. For example, the title for the gauge chart panel is [eCommerce] Orders. +1. Choose **Save and return**. + +The gauge chart visualization is now saved and you are taken back to the dashboard. You'll see two visualizations on the dashboard, like the following. + +![Dashboard showing visualizations combined in a single view]({{site.url}}{{site.baseurl}}/images/dashboards/dashboard-combined.png) + +## Adding subsequent panels + +Continuing with the dashboard you created in the preceding steps, you'll add an existing visualization to the dashboard: + +1. From the dashboard toolbar, choose **Add**. +1. From the **Add panels** window, choose **[eCommerce] Sales by Category**. +1. Choose `x` to close the **Add panels** window. + +You'll see an area chart visualization display on the dashboard, as shown in the following image. + +![Adding another panel to the dashboard]({{site.url}}{{site.baseurl}}/images/dashboards/dashboard-adding-panels.png) + +## Saving dashboards + +When you've finalized your dashboard, save it. If you're saving a new dashboard: + +1. In the toolbar, choose **Save**. +2. In the **Save dashboard** window, enter the **Title**. The **Description** is optional. +3. To save the time filter to the dashboard, select **Store time with dashboard**. +4. Choose **Save**. + +## Customizing the look of a panel + +To customize the panels, you'll need to be in edit mode: + +- Choose **Edit** at the top right of the toolbar. + +If you see **Create new** at the top right of the toolbar, you're already in edit mode. +{: .note} + +Displaying a legend can give readers more information, while hiding a legend can give the panel a cleaner look. If you want to display or hide the panel legend: + +- Choose the list icon in the panel's lower left corner. + +If you want to change the color of the panel legend: + +- From the visualization legend, select a category and then select a color from the flyout. The area chart updates with your change. + +This color change is only saved for the current panel and dashboard and doesn't affect the saved visualization. +{: .note} + +If you want to change the color of the panel legend in the visualization: + +1. Choose the gear icon on the area chart panel. +2. From the **Options** window, select **Edit visualization**. +3. From the visualization legend, select a category and then select a color from the flyout. The area chart updates with your change. +4. Choose **Save and return**. + +This color change affects the saved visualization and any dashboard that links to the visualization. +{: .note} + +If you want to display, hide, or customize the panel title: + +1. Choose the gear icon on the panel. +2. From the **Options** window, select **Edit panel title**. +3. From the **Customize panel**, enter a title under **Panel title** or toggle the **Show panel title** to hide the title. +4. Choose **Save**. 
+ +Changing panel titles only affects the particular panel on the particular dashboard and won't affect any other panel containing that same visualization or any other dashboard. +{: .note} + +## Arranging panels + +To organize panels, arrange them side by side, or resize them, you can use these options: + +- To move a panel, select and hold the panel title or the top of the panel and drag to the new location. +- To resize a panel, choose the resize icon in the panel's lower-right corner and drag to the new dimensions. +- To view a panel in full screen mode, choose the gear icon (edit mode) or vertical ellipsis (⋮) at the top right of the panel and select **Maximize panel**. To minimize the full screen mode, choose the gear icon or vertical ellipsis and select **Minimize**. + +The following is an example of a customized dashboard created by using this tutorial. + +![Customized dashboard with panels arranged side by side and without legends]({{site.url}}{{site.baseurl}}/images/dashboards/dashboard-customized.png) diff --git a/_dashboards/dashboard/plugins-dashboards.md b/_dashboards/dashboard/plugins-dashboards.md new file mode 100644 index 00000000..a4a0f6eb --- /dev/null +++ b/_dashboards/dashboard/plugins-dashboards.md @@ -0,0 +1,43 @@ +--- +layout: default +title: Integrating plugins into a dashboard +parent: Observability +nav_order: 5 +--- + +# Integrating plugins into a dashboard + +Observability is a collection of plugins and applications that let you visualize data-driven events by using [Piped Processing Language]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/) to explore, discover, and query data stored in OpenSearch. Observability provides a unified experience for collecting and monitoring metrics, logs, and traces from common data sources. With data collection and monitoring in one place, you have full-stack, end-to-end observability of your entire infrastructure. + +As of OpenSearch 2.7, you can manage your observability plugins with **Observability Dashboards** or **Dashboard** instead of the plugins page. This feature provides you: + +- **Instant access to installed plugins:** The dashboard displays all installed plugins in one place. +- **Improved efficiency:** With a list of plugins readily available from a dashboard, you can enable, disable, update, or remove plugins in the OpenSearch Dashboards UI. +- **Better troubleshooting:** Viewing a list of plugins from a dashboard can help you quickly identify which plugins may be causing a problem. +- **Enhanced security:** With a list of plugins readily available from a dashboard, you can easily see if any outdated or vulnerable plugins are present and then quickly remove or update them, minimizing or avoiding security risks. +- **Improved website performance:** Viewing a list of plugins from a dashboard can help you identify any plugins that may be slowing down your website or causing performance issues. + +Get familiar with the basics of managing plugins from the Dashboard app in less than 20 seconds in the following video. + +![Demo of using Dashboard to view a list of observability plugins](https://user-images.githubusercontent.com/105296784/234345611-50beb9a6-6118-449a-b015-b9f9e90b525e.gif) + +## Viewing a list of installed plugins + +To view a list of installed plugins from the Dashboard app, follow these steps: + +1. From the OpenSearch Dashboards main menu, select **Dashboard**. +2. View the list of items and select your plugin. 
Plugins are categorized automatically as the Observability Dashboard data type, which you can filter in order to concentrate on just what you want to see. + +## Adding and removing plugins + +To add a plugin from the Dashboard app, follow these steps: + +1. From the OpenSearch Dashboards main menu, select **Dashboard**. +2. In the **Dashboards** window, select **Create** > **Dashboard**. +3. In the **Create operational panel** window, enter a name in the **Name** field and then select **Create**. The plugin is added to both the Observability app and the Dashboard app. + +You can remove a plugin from the Dashboard app by selecting the edit icon under the **Actions** column and then selecting **Delete**. + +## Staying updated about OpenSearch Dashboards plugins + +The [OpenSearch plugins repository](https://github.com/opensearch-project/opensearch-plugins) on GitHub is a great way to keep track of and contribute to tasks, features, enhancements, and bugs. The OpenSearch Project team welcomes your input. diff --git a/_dashboards/dashboards-assistant/index.md b/_dashboards/dashboards-assistant/index.md new file mode 100644 index 00000000..dd62347c --- /dev/null +++ b/_dashboards/dashboards-assistant/index.md @@ -0,0 +1,130 @@ +--- +layout: default +title: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 3 +has_children: false +has_toc: false +--- + +This is an experimental feature and is not recommended for use in a production environment. For updates on the feature's progress or to leave feedback, go to the [`dashboards-assistant` repository](https://github.com/opensearch-project/dashboards-assistant) on GitHub or the associated [OpenSearch forum thread](https://forum.opensearch.org/t/feedback-opensearch-assistant/16741). +{: .warning} + +Note that machine learning models are probabilistic and that some may perform better than others, so the OpenSearch Assistant may occasionally produce inaccurate information. We recommend evaluating outputs for accuracy as appropriate to your use case, including reviewing the output or combining it with other verification factors. +{: .important} + +# OpenSearch Assistant for OpenSearch Dashboards +Introduced 2.12 +{: .label .label-purple } + +The OpenSearch Assistant toolkit helps you create AI-powered assistants for OpenSearch Dashboards without requiring you to have specialized query tools or skills. + +## Enabling OpenSearch Assistant + +To enable **OpenSearch Assistant** in OpenSearch Dashboards, locate your copy of the `opensearch_dashboards.yml` file and set the following option: + +``` +assistant.chat.enabled: true +``` +{% include copy-curl.html %} + +Then configure the root `agent_id` through the following API: + +``` +PUT .plugins-ml-config/_doc/os_chat +{ + "type":"os_chat_root_agent", + "configuration":{ + "agent_id": "your root agent id" + } +} +``` +{% include copy-curl.html %} + +This example shows a system index. In security-enabled domains, only super admins have permission to execute this code. For information about making super admin calls, see the [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/) guide. For access permission, contact your IT administrator. +{: .warning} + +Next, restart the OpenSearch Dashboards server. Following a successful restart, **OpenSearch Assistant** appears in the OpenSearch Dashboards interface. + +A screenshot of the interface is shown in the following image. 
+ +OpenSearch Assistant interface + +For more information about ways to enable experimental features, see [Experimental feature flags]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/experimental/). +{: .note} + +## Configuring OpenSearch Assistant + +You can use the OpenSearch Dashboards interface to configure OpenSearch Assistant. Go to the [Getting started guide](https://github.com/opensearch-project/dashboards-assistant/blob/main/GETTING_STARTED_GUIDE.md) for step-by-step instructions. For the chatbot template, go to the [Flow Framework plugin](https://github.com/opensearch-project/flow-framework) documentation. You can modify this template to use your own model and customize the chatbot tools. + +For information about configuring OpenSearch Assistant through the REST API, see [OpenSearch Assistant Toolkit]({{site.url}}{{site.baseurl}}/ml-commons-plugin/opensearch-assistant/). + +## Using OpenSearch Assistant in OpenSearch Dashboards + +The following tutorials guide you through using OpenSearch Assistant in OpenSearch Dashboards. OpenSearch Assistant can be viewed full frame or in the right sidebar. The default is sidebar. To view full frame, select the frame icon {::nomarkdown}frame icon{:/} in the toolbar. + +### Start a conversation + +Start a conversation by entering a prompt in the **Ask a question** search box or by using the shortcut `ctrl + /`. Select **Go** to initiate the conversation. A response is generated. + +The following screenshot shows an example prompt and response. + +Prompt and response using OpenSearch Assistant in OpenSearch Dashboards + +### Regenerate a response + +Beneath the response, select the regenerate icon to generate an alternative answer to your original question. The new answer will replace the previous one, appearing in both the interface and the chat history. A regenerated example is shown in the following image. + +Regenerated response + +### Suggested prompts + +OpenSearch Assistant suggests prompts to help you get started, build upon your existing prompts, or explore other queries you may not have considered, among other reasons. Select a suggested prompt listed beneath the response field. A screenshot is shown in the following image. + +Suggested prompts + +### Rate a response + +To rate a response, select the thumbs up or thumbs down icon. A screenshot of the interface is shown in the following image. The feedback is stored in the `additional_info` field of the message index. + +### Response generation + +Learn how a response is generated by selecting the **How was this generated?** option. This option is included within the available suggestions to help you understand which tools were involved in creating the response. If multiple tools were involved, each step will display the tool name and its input and output. This feature can be useful for troubleshooting. A screenshot is shown in the following image. + +Response generation details + +### Resume previous conversations + +To view a previous conversation, select the clock icon to open the conversation history panel and display the chat history. The conversation history can also be searched by conversation name. A screenshot is shown in the following image. + +Conversation history + +#### Edit and delete previous conversations + +Select the pencil icon to edit a conversation name and rename it. Select the **Confirm name** button to save the new name. A screenshot is shown in the following image. 
+ +Editing a conversation name + +Select the trash can icon to delete a conversation. Once the confirmation dialog appears, select **Delete conversation**. The conversation is now deleted from your chat history. A screenshot is shown in the following image. + +Deleting a conversation + +### Share a conversation through Notebooks + +You can use [Notebooks]({{site.url}}{{site.baseurl}}/observing-your-data/notebooks/) to save your conversations. To use this option, select **Save to notebook** from the dropdown menu to the right of **OpenSearch Assistant**. Enter a name for the notebook, then select **Save**. A pop-up message in the lower-right corner confirms the conversation has been saved. + +All conversations (prompts and responses/questions and answers) between you and the large language model (LLM) will be saved to this notebook. + +To open the saved notebook or view a list of other notebooks, select **Observability** > **Notebooks** from the OpenSeach Dashboards navigation menu. + +A screenshot of the Notebooks interface with a list of saved conversations is shown in the following image. + +Notebooks interface with saved OpenSearch Assistant conversations + +The following screenshot shows a saved conversation, along with actions you can take for the saved conversation. + +Notebooks interface with saved OpenSearch Assistant conversations + +## Related articles + +- [Getting started guide for OpenSearch Assistant in OpenSearch Dashboards](https://github.com/opensearch-project/dashboards-assistant/blob/main/GETTING_STARTED_GUIDE.md) +- [OpenSearch Assistant configuration through the REST API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/opensearch-assistant/) diff --git a/_dashboards/dev-tools/index-dev.md b/_dashboards/dev-tools/index-dev.md new file mode 100644 index 00000000..814c2320 --- /dev/null +++ b/_dashboards/dev-tools/index-dev.md @@ -0,0 +1,21 @@ +--- +layout: default +title: Dev Tools +nav_order: 120 +has_children: true +--- + +# Dev Tools + +**Dev Tools** is a development environment that lets you set up your OpenSearch Dashboards environment, run queries, explore data, and debug problems. You can use the Dev Tools console to: + +- **Set up your OpenSearch Dashboards environment.** For example, you can use the console to configure authentication settings for your OpenSearch Dashboards instance. +- **[Run queries to explore your data]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/run-queries/).** For example, you can use the console to tune your queries for relevance. +- **Debug problems with your queries.** For example, if your query is not returning the results you expect, you can use the console to identify the problem. +- **Learn about the APIs in OpenSearch.** For example, you can use the API reference documentation linked in the console to look up the syntax for different API calls (select the question circle icon ({::nomarkdown}question circle icon{:/})). +- **Develop custom visualizations.** For example, you can use the console to create [Vega visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/#vega). +- **Customize the appearance and behavior of dashboards.** For example, you can use the console to customize dashboard visualization colors or to add new filters. + +To access the console, go to the OpenSearch Dashboards main menu and select **Management** > **Dev Tools**. An example is shown in the following image. 
+ +Dev Tools console interface diff --git a/_dashboards/dev-tools/run-queries.md b/_dashboards/dev-tools/run-queries.md new file mode 100644 index 00000000..7f92de9f --- /dev/null +++ b/_dashboards/dev-tools/run-queries.md @@ -0,0 +1,66 @@ +--- +layout: default +title: Running queries in the Dev Tools console +parent: Dev Tools +nav_order: 10 +redirect_from: + - /dashboards/run-queries/ +--- + +# Running queries in the Dev Tools console + +The Dev Tools console can be used to send queries to OpenSearch. To access the console, go to the OpenSearch Dashboards main menu and select **Management** > **Dev Tools**. +## Writing queries + +OpenSearch provides a query domain-specific language (DSL) called [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/). It is a flexible language with a JSON interface. + +To write your queries, use the editor pane on the left side of the console. To send a query to OpenSearch, select the query by placing the cursor in the query text and then selecting the play icon ({::nomarkdown}play icon{:/}) on the upper right of the request or press `Ctrl/Cmd+Enter`. The response from OpenSearch is displayed in the response pane on the right side of the console. To run multiple commands simultaneously, select all the commands in the editor pane, and then select the play icon or press `Ctrl/Cmd+Enter`. + +An example of the query and response panes is shown in the following image. + +Console UI with query and request + +### Query options + +When writing queries using the console, there are common actions that can help you write queries more efficiently and accurately. The following table describes these features and how you can use them. + +Feature | How to use | +--------|------------| +**Collapsing or expanding a query** | To hide or show details of your query, select the expander arrow ({::nomarkdown}arrow down icon{:/}) next to the line number. | +**Auto indenting** | To use auto indent, select the queries that you want to format, then select the wrench icon ({::nomarkdown}wrench icon{:/}), and choose **Auto indent**. | +**Autocomplete** | To define your preferences for autocomplete suggestions, configure them in **Settings**. | +**Request history** | To view request history, select **History** from the top menu. If you select the request you want to view from the left pane, the query is shown in the right pane. To copy the query into the editor pane, select the query text and then select **Apply**. To clear the history, select **Clear**. | +**Keyboard shortcuts** | To view all available keyboard shortcuts, select **Help** from the top menu. | +**Documentation access from the console** | To access OpenSearch documentation from the console, select the wrench icon ({::nomarkdown}wrench icon{:/}) and choose **Open documentation**. | + +## Working in the cURL and console formats + +The console uses a simplified syntax to format REST requests instead of the `curl` command. If you paste a `curl` command directly into the console, the command is automatically converted into the format used by the console. To import a query in cURL format, select the query, then select the wrench icon ({::nomarkdown}wrench icon{:/}), and choose **Copy as cURL**. 
+ +For example, the following `curl` command runs a search query: + +```bash +curl -XGET http://localhost:9200/shakespeare/_search?pretty -H 'Content-Type: application/json' -d' +{ + "query": { + "match": { + "text_entry": "To be, or not to be" + } + } +}' +``` +{% include copy.html %} + +The same query has a simplified syntax in the console format, as shown in the following example: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": "To be, or not to be" + } + } +} +``` +{% include copy-curl.html %} diff --git a/_dashboards/discover/index-discover.md b/_dashboards/discover/index-discover.md new file mode 100644 index 00000000..4e32c4f1 --- /dev/null +++ b/_dashboards/discover/index-discover.md @@ -0,0 +1,113 @@ +--- +layout: default +title: Analyzing data +nav_order: 20 +has_children: true +redirect_from: + - /dashboards/discover/index-discover/ +--- + +# Analyzing data + +To analyze your data in OpenSearch and visualize key metrics, you can use the **Discover** application in OpenSearch Dashboards. An example of data analysis in **Discover** is shown in the following image. + +A Discover default page + +## Getting started + +In this tutorial, you'll learn about using **Discover** to: + +- Add data. +- Interpret and visualize data. +- Share data findings. +- Set alerts. + +Before getting started, make sure you: + +- Install [OpenSearch Dashboards](https://opensearch.org/downloads.html). +- Add sample data or import your own data into OpenSearch. Go to the [OpenSearch Dashboards quickstart guide]({{site.url}}{{site.baseurl}}/dashboards/quickstart/) to learn about adding sample datasets. Go to [Managing indexes]({{site.url}}{{site.baseurl}}/im-plugin/index/) to learn about importing your own data. +- Have a foundational understanding of [OpenSearch documents and indexes]({{site.url}}{{site.baseurl}}/im-plugin/index/). + +## Defining the search + +To define a search, follow these steps: + +1. On the OpenSearch Dashboards navigation menu, select **Discover**. +2. Choose the data you want to work with. In this case, choose `opensearch_dashboards_sample_data_flights` from the upper-left dropdown menu. +3. Select the {::nomarkdown}calendar icon{:/} icon to change the time range of your search and then select **Refresh**. + +The resulting view is shown in the following image. + +Discover interface showing search of flight sample data for Last 7 days + +## Analyzing document tables + +In OpenSearch, a document table stores unstructured data. In a document table, each row represents a single document, and each column contains document attributes. + +To examine document attributes, follow these steps: + +1. From the data table's left column, choose the {::nomarkdown}inspect icon{:/} icon to open the **Document Details** window. Select the {::nomarkdown}minimize icon{:/} icon to close the **Document Details** window. +2. Examine the metadata. You can switch between the **Table** and **JSON** tabs to view the data in your preferred format. +3. Select **View surrounding documents** to view data for other log entries either preceding or following your current document or select **View single document** to view a particular log entry. + +The resulting view is shown in the following image. + +Document attributes + +To add or delete fields in a document table, follow these steps: + +1. View the data fields listed under **Available fields** and select the {::nomarkdown}plus icon{:/} icon to add the desired fields to the document table. 
The field will be automatically added to both **Selected fields** and the document table. For this example, choose the fields `Carrier`, `AvgTicketPrice`, and `Dest`. +2. Select **Sort fields** > **Pick fields to sort by**. Drag and drop the chosen fields in the desired sort order. + +The resulting view is shown in the following image. + +Adding and deleting data fields + +## Searching data + +You can use the search toolbar to enter a [DQL]({{site.url}}{{site.baseurl}}/dashboards/discover/dql/) or [query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) query. The search toolbar is best for basic queries; for full query and filter capability, use [query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/index/) in the [Dev Tools console]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/index-dev/). + +For more information, see [Discover and Dashboard search toolbar]({{site.url}}{{site.baseurl}}/dashboards/index/#discover-and-dashboard-search-bar). + +## Filtering data + +Filters allow you to narrow the results of a query by specifying certain criteria. You can filter by field, value, or range. The **Add filter** pop-up suggests the available fields and operators. + +To filter your data, follow these steps: + +1. Under the DQL search bar, choose **Add filter**. +2. Select the desired options from the **Field**, **Operator**, and **Value** dropdown lists. For example, select `Cancelled`, `is`, and `true`. +3. Choose **Save**. +4. To remove a filter, choose the {::nomarkdown}cross icon{:/} icon to the right of the filter name. + +The resulting view is shown in the following image. + +Visualize data findings interface + +## Saving a search + +To save your search, including the query text, filters, and current data view, follow these steps: + +1. Select **Save** on the upper-right toolbar. +2. Add a title, and then choose **Save**. +3. Select **Open** on the upper-right toolbar to access your saved searches. + +## Visualizing data findings + +To visualize your data findings, follow these steps: + +1. Select the {::nomarkdown}inspect icon{:/} icon to the right of the field you want to visualize. + + The resulting view is shown in the following image. + + Visualize data findings interface + +2. Select the **Visualize** button. When the **Visualize** application is launched, a visualization appears. + + The resulting view is shown in the following image. + + Data visualization of flight sample data field destination + +## Setting alerts + +Set alerts to notify you when your data exceeds your specified thresholds. Go to [Alerting dashboards and visualizations]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/dashboards-alerting/) to learn about creating and managing alerts. diff --git a/_dashboards/discover/time-filter.md b/_dashboards/discover/time-filter.md new file mode 100644 index 00000000..288138d0 --- /dev/null +++ b/_dashboards/discover/time-filter.md @@ -0,0 +1,36 @@ +--- +layout: default +title: Time filter +parent: Analyzing data +nav_order: 20 +redirect_from: + - /dashboards/get-started/time-filter/ + - /dashboards/discover/time-filter/ +--- + +# Time filter + +The time filter is used to set the time range, including minutes, hours, days, weeks, months, or years, that is displayed on your dashboard. + +The default time range is **Last 15 minutes**. You can change the time range at the dashboard level or under **Dashboards Management** > **Advanced Settings** > **Time filter defaults**. 
+{: .note} + +To change the time range at the dashboard level, follow these steps: + +1. From an OpenSearch Dashboards application (Discover, Dashboards, or Visualize), select the {::nomarkdown}calendar icon{:/} icon on the right of the search bar. +2. Select one of the time filter options, as shown in the following image: + - **Quick select:** Choose a time based on the last or next number of seconds, minutes, hours, days, or another time unit. + - **Commonly used:** Choose a common time range like **Today**, **Last 7 days**, or **Last 30 days**. + - **Recently used date ranges:** Select a previously used time range. + - **Refresh every:** Set an automatic refresh period. + + Time range interface + +3. Choose **Show dates** to set start and end times, and then select anywhere inside the toolbar to access the time filter pop-up window, as shown in the following image. + + Time filter pop-up window + +4. Select **Absolute**, **Relative**, or **Now** and specify ranges. +5. Choose **Update** to apply changes, as shown in the following image. + + Start and end times interface diff --git a/_dashboards/dql.md b/_dashboards/dql.md index 3e71145f..7ddcbc6d 100644 --- a/_dashboards/dql.md +++ b/_dashboards/dql.md @@ -1,142 +1,369 @@ --- layout: default -title: Dashboards query language -nav_order: 99 +title: Dashboards Query Language (DQL) +nav_order: 130 +redirect_from: + - /dashboards/dql/ + - /dashboards/discover/dql/ --- -# Dashboards Query Language +# Dashboards Query Language (DQL) -Similar to the [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index) that lets you use the HTTP request body to search for data, you can use the Dashbaords Query Language (DQL) in OpenSearch Dashboards to search for data and visualizations. +Dashboards Query Language (DQL) is a simple text-based query language used to filter data in OpenSearch Dashboards. For example, to display your site visitor data for a host in the United States, you would enter `geo.dest:US` in the search field, as shown in the following image. -For example, if you want to see all visualizations of visits to a host based in the US, enter `geo.dest:US` into the search field, and Dashboards refreshes to display all related data. +Search term using DQL toolbar in Dashboard -Just like the query DSL, DQL has a handful of query types, so use whichever best fits your use case. +DQL and query string query (Lucene) language are the two search bar language options in Discover and Dashboards. To compare these language options, see [Discover and Dashboard search bar]({{site.url}}{{site.baseurl}}/dashboards/index/#discover-and-dashboard-search-bar). +{: .tip} -This section uses the OpenSearch Dashboards sample web log data. To add sample data in Dashboards, log in to OpenSearch Dashboards, choose **Home**, **Add sample data**, and then **Add data**. +## Setup ---- +To follow this tutorial in OpenSearch Dashboards, expand the following setup steps. -#### Table of contents -1. TOC -{:toc} +
+ + Setup + + {: .text-delta} ---- +Use the following steps to prepare sample data for querying. -## Terms query +**Step 1: Set up mappings for the index** -The most basic query is to just specify the term you're searching for. +On the main menu, select **Management** > **Dev Tools** to open [Dev Tools]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/run-queries/). Send the following request to create index mappings: +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "date" : { + "type" : "date", + "format" : "yyyy-MM-dd" + } + } + } +} ``` -host:www.example.com +{% include copy-curl.html %} + +**Step 2: Ingest the documents into the index** + +In **Dev Tools**, ingest the following documents into the index: + +```json +PUT /testindex/_doc/1 +{ + "title": "The wind rises", + "description": "A biographical film", + "media_type": "film", + "date": "2013-07-20", + "page_views": 100 +} +``` +{% include copy-curl.html %} + +```json +PUT /testindex/_doc/2 +{ + "title": "Gone with the wind", + "description": "A well-known 1939 American epic historical film", + "media_type": "film", + "date": "1939-09-09", + "page_views": 200 +} +``` +{% include copy-curl.html %} + +```json +PUT /testindex/_doc/3 +{ + "title": "Chicago: the historical windy city", + "media_type": "article", + "date": "2023-07-29", + "page_views": 300 +} +``` +{% include copy-curl.html %} + +```json +PUT /testindex/_doc/4 +{ + "article title": "Wind turbines", + "media_type": "article", + "format": "2*3" +} +``` +{% include copy-curl.html %} + +**Step 3: Create an index pattern** + +Follow these steps to create an index pattern for your index: + +1. On the main menu, select **Management** > **Dashboards Management**. +1. Select **Index patterns** and then **Create index pattern**. +1. In **Index pattern name**, enter `testindex*`. Select **Next step**. +1. In **Time field**, select `I don't want to use the time filter`. +1. Select **Create index pattern**. + +For more information about index patterns, see [Index patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/). + +**Step 4: Navigate to Discover and select the index pattern** + +On the main menu, select **Discover**. In the upper-left corner, select `testindex*` from the **Index patterns** dropdown list. The main panel displays the documents in the index, and you can now try out the DQL queries described on this page. + +The [Object fields](#object-fields) and [Nested fields](#nested-fields) sections provide links for additional setup needed to try queries in those sections. +{: .note} +
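+
+Optionally, you can confirm that the sample documents were ingested by running a quick search in **Dev Tools**; the following request returns all four documents:
+
+```json
+GET testindex/_search
+```
+{% include copy-curl.html %}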
+ +## Search for terms + +By default, DQL searches in the field set as the default field on the index. If the default field is not set, DQL searches all fields. For example, the following query searches for documents containing the words `rises` or `wind` in any of their fields: + +```python +rises wind +``` +{% include copy.html %} + +The preceding query matches documents in which any search term appears regardless of the order. By default, DQL combines search terms with an `or`. To learn how to create Boolean expressions containing search terms, see [Boolean operators](#boolean-operators). + +To search for a phrase (an ordered sequence of words), surround your text with quotation marks. For example, the following query searches for the exact text "wind rises": + +```python +"wind rises" +``` +{% include copy.html %} + +Hyphens are reserved characters in Lucene, so if your search term contains hyphens, DQL might prompt you to switch to Lucene syntax. To avoid this, surround your search term with quotation marks in a phrase search or omit the hyphen in a regular search. +{: .tip} + +## Reserved characters + +The following is a list of reserved characters in DQL: + +`\`, `(`, `)`, `:`, `<`, `>`, `"`, `*` + +Use a backslash (`\`) to escape reserved characters. For example, to search for an expression `2*3`, specify the query as `2\*3`: + +```plaintext +2\*3 +``` +{% include copy.html %} + +## Search in a field + +To search for text in a particular field, specify the field name before the colon: + +```python +title: rises wind +``` +{% include copy.html %} + +The analyzer for the field you're searching parses the query text into tokens and matches documents in which any of the tokens appear. + +DQL ignores white space characters, so `title:rises wind` and `title: rises wind` are the same. +{: .tip} + +Use wildcards to refer to field names containing spaces. For example, `article*title` matches the `article title` field. +{: .tip} + +## Field names + +Specify the field name before the colon. The following table contains example queries with field names. + +Query | Criterion for a document to match | Matching documents from the `testindex` index +:--- | :--- | :--- +`title: wind` | The `title` field contains the word `wind`. | 1, 2 +`title: (wind OR windy)` | The `title` field contains the word `wind` or the word `windy`. | 1, 2, 3 +`title: "wind rises"` | The `title` field contains the phrase `wind rises`. | 1 +`title.keyword: The wind rises` | The `title.keyword` field exactly matches `The wind rises`. | 1 +`title*: wind` | Any field that starts with `title` (for example, `title` and `title.keyword`) contains the word `wind` | 1, 2 +`article*title: wind` | The field that starts with `article` and ends with `title` contains the word `wind`. Matches the field `article title`. | 4 +`description:*` | Documents in which the field `description` exists. | 1, 2 + +## Wildcards + +DQL supports wildcards (`*` only) in both search terms and field names, for example: + +```python +t*le: *wind and rise* +``` +{% include copy.html %} + +## Ranges + +DQL supports numeric inequalities using the `>`, `<`, `>=`, and `<=` operators, for example: + +```python +page_views > 100 and page_views <= 300 +``` +{% include copy.html %} + +You can use the range operators on dates. 
For example, the following query searches for documents containing dates within the 2013--2023 range, inclusive: + +```python +date >= "2013-01-01" and date < "2024-01-01" +``` +{% include copy.html %} + +You can query for "not equal to" by using `not` and the field name, for example: + +```python +not page_views: 100 +``` +{% include copy.html %} + +Note that the preceding query returns documents in which either the `page_views` field does not contain `100` or the field is not present. To filter by those documents that contain the field `page_views`, use the following query: + +```python +page_views:* and not page_views: 100 +``` +{% include copy.html %} + +## Boolean operators + +DQL supports the `and`, `or`, and `not` Boolean operators. DQL is not case sensitive, so `AND` and `and` are the same. For example, the following query is a conjunction of two Boolean clauses: + +```python +title: wind and description: epic +``` +{% include copy.html %} + +Boolean operators follow the logical precedence order of `not`, `and`, and `or`, so in the following example, `title: wind and description: epic` is evaluated first: + +```python +media_type: article or title: wind and description: epic +``` +{% include copy.html %} + +To dictate the order of evaluation, group Boolean clauses in parentheses. For example, in the following query, the parenthesized expression is evaluated first: + +```python +(media_type: article or title: wind) and description: epic +``` +{% include copy.html %} + +The field prefix refers to the token that immediately follows the colon. For example, the following query searches for documents in which the `title` field contains `windy` or documents containing the word `historical` in any of their fields: + +```python +title: windy or historical +``` +{% include copy.html %} + +To search for documents in which the `title` field contains `windy` or `historical`, group the terms in parentheses: + +```python +title: (windy or historical) +``` +{% include copy.html %} + +The preceding query is equivalent to `title: windy or title: historical`. + +To negate a query, use the `not` operator. For example, the following query searches for documents that contain the word `wind` in the `title` field, are not of the `media_type` `article`, and do not contain `epic` in the `description` field: + +```python +title: wind and not (media_type: article or description: epic) +``` +{% include copy.html %} + +Queries can contain multiple grouping levels, for example: + +```python +title: ((wind or windy) and not rises) +``` +{% include copy.html %} + +## Object fields + +To refer to an object's inner field, list the dot path of the field. + +To index a document containing an object, follow the steps in the [object field type example]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object/#example). To search the `name` field of the `patient` object, use the following syntax: + +```python +patient.name: john +``` +{% include copy.html %} + +## Nested fields + +To refer to a nested object, list the JSON path of the field. + +To index a document containing an object, follow the steps in the [nested field type example]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/#nested-field-type-1). + +To search the `name` field of the `patients` object, use the following syntax: + +```python +patients: {name: john} +``` +{% include copy.html %} + +To retrieve documents that match multiple fields, specify all the fields. 
For example, consider an additional `status` field in the following document: + +```json +{ + "status": "Discharged", + "patients": [ + {"name" : "John Doe", "age" : 56, "smoker" : true}, + {"name" : "Mary Major", "age" : 85, "smoker" : false} + ] +} ``` -To access an object's nested field, list the complete path to the field separated by periods. For example, to retrieve the `lat` field in the `coordinates` object: +To search for a discharged patient whose name is John, specify the `name` and the `status` in the query: +```python +patients: {name: john} and status: discharged ``` -coordinates.lat:43.7102 +{% include copy.html %} + +You can combine multiple Boolean and range queries to create a more refined query, for example: + +```python +patients: {name: john and smoker: true and age < 57} ``` +{% include copy.html %} -DQL also supports leading and trailing wildcards, so you can search for any terms that match your pattern. +## Doubly nested fields -``` -host.keyword:*.example.com/* -``` - -To check if a field exists or has any data, use a wildcard to see if Dashboards returns any results. - -``` -host.keyword:* -``` - -## Boolean query - -To mix and match, or even combine, multiple queries for more refined results, you can use the boolean operators `and`, `or`, and `not`. DQL is not case sensitive, so `AND` and `and` are the same. - -``` -host.keyword:www.example.com and response.keyword:200 -``` - -The following example demonstrates how to use multiple operators in one query. - -``` -geo.dest:US or response.keyword:200 and host.keyword:www.example.com -``` - -Remember that boolean operators follow the logical precedence order of `not`, `and`, and `or`, so if you have an expression like the previous example, `response.keyword:200 and host.keyword:www.example.com` gets evaluated first, and then Dashboards uses that result to compare with `geo.dest:US`. - -To avoid confusion, we recommend using parentheses to dictate the order you want to evaluate in. If you want to evaluate `geo.dest:US or response.keyword:200` first, your expression becomes: - -``` -(geo.dest:US or response.keyword:200) and host.keyword:www.example.com -``` - -## Date and range queries - -DQL also supports inequalities if you're using numeric inequalities. - -``` -bytes >= 15 and memory < 15 -``` - -Similarly, you can use the same method to find a date before or after your query. `>` indicates a search for a date after your specified date, and `<` returns dates before. - -``` -@timestamp > "2020-12-14T09:35:33" -``` - -## Nested field query - -If you have a document with nested fields, you have to specify which parts of the document you want to retrieve. - -Suppose that you have the following document: +Consider a document with a doubly nested field. In this document, both the `patients` and `names` fields are of type `nested`: ```json { - "superheroes":[ + "patients": [ { - "hero-name": "Superman", - "real-identity": "Clark Kent", - "age": 28 - }, - { - "hero-name": "Batman", - "real-identity": "Bruce Wayne", - "age": 26 - }, - { - "hero-name": "Flash", - "real-identity": "Barry Allen", - "age": 28 - }, - { - "hero-name": "Robin", - "real-identity": "Dick Grayson", - "age": 15 + "names": [ + { "name": "John Doe", "age": 56, "smoker": true }, + { "name": "Mary Major", "age": 85, "smoker": false} + ] } ] } ``` -The following example demonstrates how to use DQL to retrieve a specific field. 
+To search the `name` field of the `patients` object, use the following syntax: +```python +patients: {names: {name: john}} ``` -superheroes: {hero-name: Superman} +{% include copy.html %} + +In contrast, consider a document in which the `patients` field is of type `object` but the `names` field is of type `nested`: + +```json +{ + "patients": + { + "names": [ + { "name": "John Doe", "age": 56, "smoker": true }, + { "name": "Mary Major", "age": 85, "smoker": false} + ] + } +} ``` -If you want to retrieve multiple objects from your document, just specify all of the fields you want to retrieve. +To search the `name` field of the `patients` object, use the following syntax: +```python +patients.names: {name: john} ``` -superheroes: {hero-name: Superman} and superheroes: {hero-name: Batman} -``` - -The previous boolean and range queries still work, so you can submit a more refined query. - -``` -superheroes: {hero-name: Superman and age < 50} -``` - -If your document has an object nested within another object, you can still retrieve data by specifying all of the levels. - -``` -justice-league.superheroes: {hero-name:Superman} -``` +{% include copy.html %} \ No newline at end of file diff --git a/_dashboards/im-dashboards/component-templates.md b/_dashboards/im-dashboards/component-templates.md new file mode 100644 index 00000000..4e60c0e5 --- /dev/null +++ b/_dashboards/im-dashboards/component-templates.md @@ -0,0 +1,66 @@ +--- +layout: default +title: Component templates +parent: Index Management +nav_order: 50 +--- + +# Component templates +Introduced 2.7 +{: .label .label-purple } + +Component templates allow you to create a single index pattern that matches multiple indexes. This pattern can include wildcards or regular expressions, enabling you to apply the same setting or mapping to multiple indexes simultaneously. + +Using them with [index templates]({{site.url}}{{site.baseurl}}/im-plugin/index-templates/) can provide a powerful tool for managing large volumes of data. You can create an index template that defines the basic structure and settings of your indexes and then use the component templates to apply the settings to all indexes that match a specific pattern or set of criteria. + +You can create component templates using the Index Management UI. The UI maximizes ease of use for common indexing and data stream administrative operations such as create, read, update, delete (CRUD) and mapping indexes; CRUD and mapping aliases; reindexing; and open/close, shrink, and split indexes, along with the monitoring of actions and logging of audit records. + +The following GIF demonstrates creating a component template. + +![Component template demo]({{site.url}}{{site.baseurl}}/images/admin-ui-index/component.gif) + +## Prerequisites + +This tutorial is intended for admin users who [manage OpenSearch clusters]({{site.url}}{{site.baseurl}}/tuning-your-cluster/cluster/) and are familiar with [index management in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/index/). + +## Key terms + +It's helpful to understand the following terms before starting this tutorial: + +- *Component template* refers to a reusable building block with settings, mappings, and aliases that can be attached to an index template. +- *Index template* refers to a predefined structure used to organize and store data in a database or search index. 
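+
+If you prefer to work with component templates programmatically, you can also create one from the **Dev Tools** console. The following request is a minimal sketch; the template name `sample_settings` and the settings it contains are only illustrative:
+
+```json
+PUT _component_template/sample_settings
+{
+  "template": {
+    "settings": {
+      "index.number_of_shards": 2,
+      "index.number_of_replicas": 1
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+An index template can then list this component template in its `composed_of` array so that any index created from that index template inherits these settings.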
+
+## Creating component templates using the Index Management UI
+
+You can use predefined OpenSearch Dashboards component templates or customize your own, either by creating original templates or by modifying existing templates. Predefined component templates include preconfigured charts, tables, and graphs and are a good starting point for users who are new to OpenSearch Dashboards. Alternatively, customized component templates provide you with options for tailoring reports and visualizations to your specific requirements and preferences.
+
+To create component templates using the UI, follow these steps:
+
+1. On the OpenSearch Dashboards main page, select **Index Management** in the navigation menu.
+1. In the Index Management window, select **Templates** > **Component templates**.
+1. Select **Create** and then define the component template settings.
+1. To configure aliases, settings, and mappings, toggle **Use configuration**, as shown in the following image.
+
+    ![Component template use configuration]({{site.url}}{{site.baseurl}}/images/admin-ui-index/component_use_config.png)
+
+1. Enter details in the aliases, settings, and mappings fields.
+1. Select **Create component template**.
+
+When you create component templates, those templates apply only to new index templates that you create and not to existing index templates.
+{: .note }
+
+## Associating component templates with index templates
+
+To associate a component template with an index template, follow these steps:
+
+1. In the **Index Management** navigation menu, select **Templates**.
+1. In the Templates window, select **Create template**.
+1. Select **Component template** as the method for defining your template.
+1. In the **Component template** pane, select **Associate component template**, as shown in the following image.
+
+    ![Component template associate configuration]({{site.url}}{{site.baseurl}}/images/admin-ui-index/associate_component.png)
+
+1. In the **Associate component template** pop-up window, select the component templates that you want to associate with your index template.
+1. Select **Associate**.
+1. Select **Preview template** to view the template settings.
+1. Select **Create template**.
\ No newline at end of file
diff --git a/_dashboards/im-dashboards/datastream.md b/_dashboards/im-dashboards/datastream.md
new file mode 100644
index 00000000..72fa4617
--- /dev/null
+++ b/_dashboards/im-dashboards/datastream.md
@@ -0,0 +1,115 @@
+---
+layout: default
+title: Data streams
+parent: Index Management
+nav_order: 20
+redirect_from:
+  - /dashboards/admin-ui-index/datastream/
+  - /opensearch/data-streams/
+---
+
+# Data streams
+Introduced 2.6
+{: .label .label-purple }
+
+In OpenSearch Dashboards, the **Index Management** application allows you to view and manage [data streams]({{site.url}}{{site.baseurl}}/im-plugin/data-streams/), as shown in the following image.
+
+![Data Streams]({{site.url}}{{site.baseurl}}/images/admin-ui-index/datastreams1.png)
+
+## Viewing a data stream
+
+To view a data stream and its health status, choose **Data streams** under **Index Management**, as shown in the following image.
+
+![Data Streams]({{site.url}}{{site.baseurl}}/images/admin-ui-index/datastreams5.png)
+
+The following are the three data stream health statuses:
+
+- Green: All primary and replica shards are assigned.
+- Yellow: At least one replica shard is not assigned.
+- Red: At least one primary shard is not assigned.
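+
+You can also check the same health status from the **Dev Tools** console by querying the data stream API. The following request is an example; replace `logs-nginx` with the name of your data stream:
+
+```json
+GET _data_stream/logs-nginx
+```
+{% include copy-curl.html %}
+
+The response includes a `status` field for each matching data stream.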
+
+## Creating a data stream
+
+To create a data stream, perform the following steps:
+
+1. Under **Index Management**, choose **Data streams**.
+
+1. Choose **Create data stream**.
+
+1. Enter a name for the data stream under **Data stream name**.
+
+1. Ensure that you have a matching index template. The matching template is displayed under **Matching index template**, as shown in the following image.
+
+    ![Data Streams]({{site.url}}{{site.baseurl}}/images/admin-ui-index/datastreams3.png)
+
+1. Review the **Inherited settings from template** and **Index alias** sections. These sections are read-only and display the backing indexes contained in the data stream.
+
+1. Note that the number of primary shards, the number of replicas, and the refresh interval are inherited from the template, as shown in the following image.
+
+    ![Data Streams]({{site.url}}{{site.baseurl}}/images/admin-ui-index/datastreams4.png)
+
+1. Choose **Create data stream**.
+
+## Deleting a data stream
+
+To delete a data stream, perform the following steps:
+
+1. Under **Index Management**, choose **Data streams**.
+
+1. Select the data stream that you want to delete.
+
+1. Choose **Actions**, and then choose **Delete**.
+
+## Rolling over a data stream
+
+To perform a rollover operation on a data stream, perform the following steps:
+
+1. Under **Index Management**, choose **Data streams**.
+
+1. Choose **Actions**, and then choose **Roll over**, as shown in the following image.
+
+    ![Rollover]({{site.url}}{{site.baseurl}}/images/admin-ui-index/rollover1.png)
+
+1. Under **Configure source**, select the source data stream on which you want to perform the rollover operation.
+
+1. Choose **Roll over**, as shown in the following image.
+
+    ![Rollover]({{site.url}}{{site.baseurl}}/images/admin-ui-index/rollover3.png)
+
+## Force merging data streams
+
+To perform a force merge operation on two or more data streams, perform the following steps:
+
+1. Under **Index Management**, choose **Data streams**.
+
+1. Select the data streams on which you want to perform the force merge operation.
+
+1. Choose **Actions**, and then choose **Force merge**.
+
+1. Under **Configure source index**, specify the data streams you want to force merge.
+
+1. Optionally, under **Advanced settings**, you can choose to **Flush indices** or **Only expunge delete** and then specify the **Max number of segments** to merge to, as shown in the following image.
+
+    ![Force Merge]({{site.url}}{{site.baseurl}}/images/admin-ui-index/forcemerge2.png)
+
+## Refreshing a data stream
+
+Refreshing a data stream makes new updates to the data stream visible to search operations.
+
+The refresh operation can be applied only to open indexes associated with the specified data streams.
+
+To refresh a data stream, select the data stream from the **Data streams** list under **Index Management**. Then select **Refresh** from the **Actions** dropdown list.
+
+## Flushing a data stream
+
+The flush operation performs a Lucene commit, writing segments to disk and starting a new translog.
+
+The flush operation can be applied only to open indexes associated with the specified data streams.
+
+To flush a data stream, select the data stream from the **Data streams** list under **Index Management**. Then select **Flush** from the **Actions** dropdown list.
+
+## Clearing a data stream cache
+
+The [clear cache operation]({{site.url}}{{site.baseurl}}/api-reference/index-apis/clear-index-cache/) can be applied only to open indexes associated with the specified data streams.
+
+To clear a data stream cache, select the data stream from the **Data streams** list under **Index Management**. Then select **Clear cache** from the **Actions** dropdown list.
\ No newline at end of file
diff --git a/_dashboards/im-dashboards/forcemerge.md b/_dashboards/im-dashboards/forcemerge.md
new file mode 100644
index 00000000..9373861b
--- /dev/null
+++ b/_dashboards/im-dashboards/forcemerge.md
@@ -0,0 +1,48 @@
+---
+layout: default
+title: Force merge
+parent: Index Management
+nav_order: 30
+redirect_from:
+  - /dashboards/admin-ui-index/forcemerge/
+---
+
+# Force merge
+Introduced 2.6
+{: .label .label-purple }
+
+OpenSearch Dashboards allows you to perform a [force merge]({{site.url}}{{site.baseurl}}/im-plugin/ism/error-prevention/index#force_merge) operation on two or more indexes or data streams with **Index Management**.
+
+## Force merging indexes
+
+To perform a force merge operation on two or more indexes, perform the following steps:
+
+1. Under **Index Management**, choose **Indices**.
+
+1. Select the indexes you want to force merge.
+
+1. Choose **Actions**, and then choose **Force merge**, as shown in the following image.
+
+    ![Force Merge]({{site.url}}{{site.baseurl}}/images/admin-ui-index/forcemerge1.png)
+
+1. Under **Configure source index**, specify the indexes you want to force merge.
+
+1. Optionally, under **Advanced settings**, you can choose to **Flush indices** or **Only expunge delete** and then specify the **Max number of segments** to merge to, as shown in the following image.
+
+    ![Force Merge]({{site.url}}{{site.baseurl}}/images/admin-ui-index/forcemerge2.png)
+
+## Force merging data streams
+
+To perform a force merge operation on two or more data streams, perform the following steps:
+
+1. Under **Index Management**, choose **Data streams**.
+
+1. Select the data streams you want to force merge.
+
+1. Choose **Actions**, and then choose **Force merge**.
+
+1. Under **Configure source index**, specify the data streams you want to force merge.
+
+1. Optionally, under **Advanced settings**, you can choose to **Flush indices** or **Only expunge delete** and then specify the **Max number of segments** to merge to, as shown in the following image.
+
+    ![Force Merge]({{site.url}}{{site.baseurl}}/images/admin-ui-index/forcemerge2.png)
diff --git a/_dashboards/im-dashboards/index-management.md b/_dashboards/im-dashboards/index-management.md
new file mode 100644
index 00000000..5b0286e3
--- /dev/null
+++ b/_dashboards/im-dashboards/index-management.md
@@ -0,0 +1,302 @@
+---
+layout: default
+title: Indexes
+parent: Index Management
+nav_order: 16
+redirect_from:
+  - /dashboards/admin-ui-index/index-management/
+---
+
+# Indexes
+Introduced 2.5
+{: .label .label-purple }
+
+In the **Index Management** section, you can perform the operations available in the [Index API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/index/).
+
+## Index policies
+
+[Policies]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/) are configurations that define the possible states of an index, the actions to perform when an index enters a given state, and the conditions that must be met to transition between states:
+
+1. **States**: The possible states of an index, including the default state for new indexes. For example, you might name your states `hot`, `warm`, or `delete`. For more information, see [States]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/#states).
+2. **Actions**: Any actions that you want the plugin to take when an index enters a given state, such as performing a rollover.
For more information, see [Actions]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/#actions). +3. **Transitions**: The conditions that must be met for an index to move into a new state. For example, if an index is more than 8 weeks old, you might want to move it to the `delete` state. For more information, see [Transitions]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/#transitions). + +You can also upload a JSON document to specify an index policy. +{: .note} + +You have complete flexibility in designing your policies. You can create any state, transition to any other state, and specify any number of actions in each state. + +To attach policies to indexes, perform the following steps: + +1. Under **Index Management**, choose **Index policies**. +2. Select the index or indexes to which you want to attach your policy. +3. Choose the **Apply policy** button. +4. From the **Policy ID** menu, select the policy that you created. + View the preview of your policy. +5. (Optional): Specify a rollover alias if your policy includes a rollover operation. Make sure that the alias already exists. For more information about the rollover operation, see [rollover]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies#rollover). +6. Choose the **Apply** button. + +After you attach a policy to an index, Index State Management (ISM) creates a job that runs every 5 minutes by default to perform policy actions, check conditions, and transition the index into different states. To change the default time interval for this job, see [Settings]({{site.url}}{{site.baseurl}}/im-plugin/ism/settings/). + +Policy jobs don't run if the cluster state is red. +{: .note} + +## Managed indexes + +To attach policies to indexes, perform the following steps: + +1. Under **Index Management**, choose **Manage Indices**. +2. Select the index or indexes to which you want to attach your policy. +3. Choose the **Change policy** button. +4. Choose the **Apply policy** button. + +## Indexes + +The **Indices** section displays a list of indexes in your OpenSearch cluster. For each index, you can see its health status (`green`, `yellow`, or `red`), policy (if the index is managed by a policy), status, total size, primary sizes, total documents, deleted documents, primaries, and replicas. + +The following are the three index health statuses: + +- Green: All primary and replica shards are assigned. +- Yellow: At least one replica shard is not assigned. +- Red: At least one primary shard is not assigned. + +### Creating an index + +While you can [create an index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/) by using a document as a base, you can also create an empty index for later use. + +To create an index, select the **Create Index** button located under the **Indices** section of **Index Management**. Then define the index by setting the following parameters: + +- Index name +- Number of primary shards +- Number of replicas +- Refresh interval + +You can also add fields and objects using either the visual editor or the JSON editor. + +The **Advanced settings** allow you to upload a JSON configuration. + +### Applying a policy + +If you analyze time-series data, you likely want to prioritize new data over old data. You might periodically perform certain operations on older indexes, such as reducing replica count or deleting them. 
+ +[ISM]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) is a plugin that lets you automate these periodic administrative operations by triggering them based on changes in the index age, index size, or number of documents. You can define policies that automatically handle index rollovers or deletions to fit your use case. + +For example, you can define a policy that moves your index into a **read_only** state after 30 days and then deletes it after a set period of 90 days. You can also set up the policy to send you a notification message when the index is deleted. + +You might want to perform an index rollover after a certain amount of time or run a **force_merge** operation on an index during off-peak hours to improve search performance during peak hours. + +To apply a policy, select the index to which you want to apply the policy from the **Indices** list under **Index Management**. Then select the **Actions** button and select **Apply policy** from the dropdown list as shown in the following image. + +User interface showing apply policy prompt + +### Closing an index + +The [close index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/close-index/) operation closes an index. Once an index is closed, you cannot add data to it or search for any data within the index. + +To close an index, select the index you want to close from the **Indices** list under **Index Management**. Then select the **Actions** button and select **Close** from the dropdown list. + +### Opening an index + +The [open index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/open-index/) operation opens a closed index, letting you add data to it or search for data within the index. + +To open an index, select the index you want to open from the **Indices** list under **Index Management**. Then select the **Actions** button and select **Open** from the dropdown list. + +### Reindexing an index + +The [reindex]({{site.url}}{{site.baseurl}}/api-reference/document-apis/reindex/) operation lets you copy all of your data or a subset of data from a source index into a destination index. + +To reindex an index, select the index from the **Indices** list under **Index Management**. Then select the **Actions** button and select **Reindex** from the dropdown list as shown in the following image. + +User interface showing reindex prompt + +### Shrinking an index + +The [shrink]({{site.url}}{{site.baseurl}}/api-reference/index-apis/shrink-index/) index operation copies all of the data in an existing index into a new index with fewer primary shards. + +To shrink an index, select the index you want to shrink from the **Indices** list under **Index Management**. Then choose the **Actions** button and choose **Shrink** from the dropdown list as shown in the following image. + +User interface showing shrink prompt + +### Splitting an index + +The [split index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/split/) operation splits an existing read-only index into a new index, splitting each primary shard into a number of primary shards in the new index. + +To split an index, select the index you want to split from the **Indices** list under **Index Management**. Then choose the **Actions** button and choose **Split** from the dropdown list as shown in the following image. + +User interface showing split page + +### Refreshing an index + +Refreshing an index makes new updates to the index visible to search operations. + +The refresh operation can be applied only to open indexes. 
+ +To refresh all indexes, select **Refresh** from the **Actions** dropdown list. + +To refresh a particular index, select the index from the **Indices** list under **Index Management**. Then select **Refresh** from the **Actions** dropdown list. + +### Flushing an index + +The flush operation performs a Lucene commit, writing segments to disk and starting a new translog. + +The flush operation can be applied only to open indexes. + +To flush all indexes, select **Flush** from the **Actions** dropdown list. + +To flush a particular index, select the index from the **Indices** list under **Index Management**. Then select **Flush** from the **Actions** dropdown list. + +### Clearing an index cache + +The [clear cache operation]({{site.url}}{{site.baseurl}}/api-reference/index-apis/clear-index-cache/) can be applied only to open indexes. + +To clear all index caches, select **Clear cache** from the **Actions** dropdown list. + +To clear a particular index cache, select the index from the **Indices** list under **Index Management**. Then select **Clear cache** from the **Actions** dropdown list. + +### Deleting an index + +If you no longer need an index, you can use the [delete index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/delete-index/) operation to delete it. + +To delete an index, select the index you want to delete from the **Indices** list under **Index Management**. Then select the **Actions** button and select **Delete** from the dropdown list. + +## Templates + +[Index templates]({{site.url}}{{site.baseurl}}/opensearch/index-templates/) let you initialize new indexes with predefined mappings and settings. For example, if you continuously index log data, you can define an index template so that all of the indexes have the same number of shards and replicas as shown in the following image. + +User interface showing Templates page + +### Creating a template + +To create a template, choose the **Create template** button on the **Templates** page under **Index Management**. + +Next, define the template: + +1. Enter the template name. +1. Select the template type. +1. Specify any index patterns you would like to use. +1. Set the priority of the template. +1. Select an index alias. +1. Set the number of primary shards. +1. Set the number of replicas. +1. Set the refresh intervals. +1. Add fields and objects for your index mapping using either the visual editor or the JSON editor. +1. Under **Advanced Settings** you can specify advanced index settings with a comma-delimited list as shown in the following image. + +User interface showing Create Template page + +### Editing a template + +To edit a template, select the template you want to edit from the list of templates. Next, select the **Actions** dropdown list and select the **Edit** option. + +### Deleting a template + +To delete a template, select the template you want to delete from the list of templates. Next, select the **Actions** dropdown list and select the **Delete** option. + +## Aliases + +An alias is a virtual index name that can point to one or more indexes. If your data is spread across multiple indexes, rather than keeping track of which indexes to query, you can create an alias and query it instead as shown in the following image. + +User interface showing Alias page + +### Creating an alias + +To create an alias, perform the following steps: + +1. Choose the **Create Alias** button on the **Aliases** page under **Index Management**. +2. Specify the alias name. +3. 
Enter the index, or index patterns, to be included in the alias. +4. Choose **Create alias** as shown in the following image. + +User interface showing create Alias page + +### Editing an alias + +To edit an alias, perform the following steps: + +1. Select the alias you want to edit. +2. Choose the **Actions** button. +3. Choose **Edit** from the dropdown list. + +### Deleting an alias + +To delete an alias, perform the following steps: + +1. Select the alias you want to edit. +2. Choose the **Actions** button. +3. Choose **Delete** from the dropdown list. + +### Refreshing an alias + +Refreshing an alias makes new updates to the index visible to search operations. + +The refresh operation can be applied only to open indexes associated with the specified aliases. + +To refresh a particular alias, select the alias from the **Aliases** list under **Index Management**. Then select **Refresh** from the **Actions** dropdown list. + +### Flushing an alias + +The flush operation performs a Lucene commit, writing segments to disk and starting a new translog. + +The flush operation can be applied only to open indexes associated with the specified aliases. + +To flush an alias, select the alias from the **Aliases** list under **Index Management**. Then select **Flush** from the **Actions** dropdown list. + +### Clearing an alias cache + +The [clear cache operation]({{site.url}}{{site.baseurl}}/api-reference/index-apis/clear-index-cache/) can be applied only to open indexes associated with the specified aliases. + +To clear an alias cache, select the alias from the **Aliases** list under **Index Management**. Then select **Clear cache** from the **Actions** dropdown list. + +## Rollup jobs + +The **Rollup Jobs** section under **Index Management** allows you to create or update index rollup jobs. + +To create a rollup job, perform the following steps: + +1. Choose the **Create rollup job** button on the **Rollup Jobs** page under **Index Management**. +2. Set the name, source index, and target index. +3. Choose **Next**. +4. Set the timestamp field and interval type. +5. Optionally, set additional aggregations and metrics. +6. Choose **Next**. +7. Under **Schedule**, check or uncheck **Enable job by default**. +8. Set the **Continuous**, **Execution frequency**, **Rollup interval**, and **Pages per execution** settings. +9. Additionally, you can set an execution delay. +10. Choose **Next**. +11. Review the settings for the rollup job and choose **Create**. + +You can also enable and disable rollup jobs by choosing the corresponding buttons on the **Rollup Jobs** page. + +## Transform jobs + +You can create, start, stop, and complete operations with [transform]({{site.url}}{{site.baseurl}}/im-plugin/index-transforms/transforms-apis/) jobs. + +To create a transform job, perform the following steps: + +1. Choose the **Create transform job** button on the **Transform Jobs** page under **Index Management**. +2. Set the name, source index, and target index. +3. Choose **Next**. +4. Select the fields to transform. From the table, select a field you want to transform by choosing **+** next to the field name. +5. Choose **Next**. +6. Check or uncheck **Job enabled by default**. +7. Set the transform execution interval and whether the schedule is continuous. +8. Optionally, set pages per execution under the **Advanced** dropdown list. +9. Choose **Next**. +10. Review the settings for the rollup job and choose **Create**. 
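The preceding steps create a transform job through the UI. The same kind of job can be defined with the [transform APIs]({{site.url}}{{site.baseurl}}/im-plugin/index-transforms/transforms-apis/) linked above. The following is a minimal sketch only: the job name, index names, group, and aggregation are hypothetical placeholders, and the exact fields required for your job may differ, so check the API documentation before adapting it.

```json
PUT _plugins/_transform/sample-transform-job
{
  "transform": {
    "enabled": true,
    "schedule": {
      "interval": {
        "start_time": 1602100553,
        "period": 1,
        "unit": "Minutes"
      }
    },
    "description": "Example transform job defined outside the UI",
    "source_index": "sample-source-index",
    "target_index": "sample-target-index",
    "data_selection_query": {
      "match_all": {}
    },
    "page_size": 1,
    "groups": [
      {
        "terms": {
          "source_field": "customer_gender",
          "target_field": "gender"
        }
      }
    ],
    "aggregations": {
      "quantity": {
        "sum": {
          "field": "total_quantity"
        }
      }
    }
  }
}
```
{% include copy-curl.html %}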
+ +You can also enable and disable transform jobs by choosing the corresponding buttons on the **Transform Jobs** page. + +## Long-running operation status check + +Certain index operations take longer to complete, from more than 30 seconds up to tens of minutes or hours. Their progress is tracked in the index status column on the **Indices** page. + +You can check the status of the reindex, shrink, and split operations because they are one-time, non-recursive operations. + +## Security integration + +Permission control is managed with existing [permissions]({{site.url}}{{site.baseurl}}/security-plugin/access-control/permissions/) or action groups that are enforced at the API level. There is currently no UI-level permission control. Users with permission to access the ISM plugin can view its pages. They can also make changes if they have permission to run the related APIs. + +## Error handling + +As with API calls, if an operation fails immediately, you are notified with an error message. If a long-running operation fails, you are notified when the failure occurs, or you can check the index status on the **Indices** page. + \ No newline at end of file diff --git a/_dashboards/im-dashboards/index.md b/_dashboards/im-dashboards/index.md new file mode 100644 index 00000000..9ad94f5f --- /dev/null +++ b/_dashboards/im-dashboards/index.md @@ -0,0 +1,52 @@ +--- +layout: default +title: Index Management +nav_order: 80 +has_children: true +redirect_from: + - /dashboards/admin-ui-index/ +--- + +# Index Management +Introduced 2.5 +{: .label .label-purple } + +The Index Management interface in OpenSearch Dashboards provides a unified solution for managing common indexing and data stream operations. The interface lets you perform create, read, update, and delete (CRUD) and mapping operations for indexes, index templates, and aliases instead of using REST APIs or YAML configurations for basic administrative tasks, and it also supports operations such as opening, closing, reindexing, shrinking, and splitting indexes. The interface also lets you validate index status and data before submitting requests and compare changes with previously saved settings before making updates. + +An example of the interface is shown in the following image. + +![Index Management user interface]({{site.url}}{{site.baseurl}}/images/dashboards/index-management-UI.png) + +## Get started with Index Management using Dashboards + +**Step 1: Open Index Management** + +- Once you're in OpenSearch Dashboards, select **Index Management** from the **OpenSearch Plugins** main menu. Then select **Indices**. + +**Step 2: View indexes** + +- In the **Indices** interface, you will see a list of the existing indexes in your OpenSearch cluster. The list provides information such as index name, health state, document count, index size, and other relevant details. + +**Step 3: Create an index** + +- To create a new index, select the **Create index** button in the upper-right corner. You will be prompted to enter the index name and configure the index settings, such as the number of shards and replicas. Fill in the required information and select **Create** to create the index. + +**Step 4: Delete an index** + +- To delete an index, locate the index and select the checkbox next to it. Then select the **Actions** button and choose **Delete** from the dropdown list. Use caution when deleting indexes because this action is irreversible. The equivalent REST requests for creating and deleting an index are sketched after this step.
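As noted in step 4, if you prefer to script index creation and deletion instead of using the UI, steps 3 and 4 correspond to the create index and delete index APIs. The following is a minimal sketch; `my-sample-index` is a hypothetical index name, and the shard and replica settings mirror the options in the **Create index** form.

```json
PUT my-sample-index
{
  "settings": {
    "index": {
      "number_of_shards": 2,
      "number_of_replicas": 1
    }
  }
}
```
{% include copy-curl.html %}

To remove the same index, send a delete request:

```json
DELETE my-sample-index
```
{% include copy-curl.html %}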
+ +**Step 5: Modify an index** + +- To modify the settings of an existing index, locate the index in the list and select its name. This takes you to the index details page. Here you can update settings such as the number of shards and replicas and other advanced configurations. After making the desired changes, select **Save**. + +**Step 6: Refresh indexes** + +- To refresh an index, locate the index and select the checkbox next to it. Then select the **Actions** button and choose **Refresh** from the dropdown list. + +**Step 7: Filter and search indexes** + +- If you have a large number of indexes and want to filter or search for specific indexes, you can use the search bar located above the list of indexes. Enter the relevant keywords or filters to narrow the list of indexes. + +**Step 8: Additional operations** + +- Index Management provides additional functionality, such as creating index patterns, managing lifecycle policies, and configuring index templates. These options are available in their respective sections of the Index Management interface. diff --git a/_dashboards/im-dashboards/notifications.md b/_dashboards/im-dashboards/notifications.md new file mode 100644 index 00000000..cc7b6e07 --- /dev/null +++ b/_dashboards/im-dashboards/notifications.md @@ -0,0 +1,60 @@ +--- +layout: default +title: Notification settings +parent: Index Management +nav_order: 60 +--- + +# Notification settings + +You can configure global default notification settings for index operations on the **Notification settings** page. You can also configure additional notification settings for individual index operations. + +## Configuring default notification settings + +In the **Notification settings** interface, you can configure the default notification settings for the following index operations that may take longer to complete: + +- Open +- Reindex +- Split +- Shrink +- Clone +- Force merge + +To get started, from the OpenSearch Dashboards main menu, select **OpenSearch Plugins** > **Index Management**. Under **Index Management**, select **Notification settings**. + +You can choose to be notified when the operation has completed or failed. Additionally, you can select the notification channels for this notification, as shown in the following image. + +![Default notification settings]({{site.url}}{{site.baseurl}}/images/admin-ui-index/notifications.png) + +If you don't have permission to view notification settings, you cannot view the default settings. +{: .note} + +## Configuring notification settings for an individual operation + +You can view default notification settings when you perform an indexing operation as well as set up additional notifications. For example, if you want to configure an additional notification for a reindex operation, perform the following steps: + +1. Select **OpenSearch Plugins** > **Index Management**. + +1. In the **Index Management** interface, select **Indices**. + +1. Select the index you want to reindex. + +1. Select **Reindex** from the **Actions** dropdown list. + +1. After selecting all reindex options, expand **Advanced settings**. Under **Notifications**, default notifications are listed. + + If you don't have permission to view notification settings, you will not be able to view the default settings. + {: .note} + +1. To receive additional notifications, select **Send additional notifications**, as shown in the following image. + + ![Individual notification settings]({{site.url}}{{site.baseurl}}/images/admin-ui-index/notifications-individual.png) + +1. 
Select whether you want to be notified when the operation has failed or completed. + +1. Select a channel from the **Notification channels** dropdown list. If you want to configure a new notification channel, select **Manage channels**. + + To configure a new notification channel, confirm that the `dashboards-notification` plugin is enabled in OpenSearch Dashboards. + {: .note} + +1. Select the **Reindex** button. diff --git a/_dashboards/im-dashboards/rollover.md b/_dashboards/im-dashboards/rollover.md new file mode 100644 index 00000000..56b29df7 --- /dev/null +++ b/_dashboards/im-dashboards/rollover.md @@ -0,0 +1,54 @@ +--- +layout: default +title: Rollover +parent: Index Management +nav_order: 40 +redirect_from: + - /dashboards/admin-ui-index/rollover/ +--- + +# Rollover +Introduced 2.6 +{: .label .label-purple } + +OpenSearch Dashboards allows you to perform an [index rollover]({{site.url}}{{site.baseurl}}/im-plugin/ism/error-prevention/index/#rollover) operation with **Index Management**. + +## Data streams + +To perform a rollover operation on a data stream, perform the following steps: + +1. Under **Index Management**, choose **Data streams**. + +1. Choose **Actions**, and then choose **Roll over**, as shown in the following image. + + ![Roll over]({{site.url}}{{site.baseurl}}/images/admin-ui-index/rollover1.png) + +1. Under **Configure source**, select the source data stream on which you want to perform the rollover operation. + +1. Choose **Roll over**, as shown in the following image. + + ![Roll over]({{site.url}}{{site.baseurl}}/images/admin-ui-index/rollover3.png) + +## Aliases + +To perform a rollover operation on an alias, perform the following steps: + +1. Under **Index Management**, choose **Aliases**. + +1. Choose **Actions**, and then choose **Roll over**, as shown in the following image. + + ![Roll over]({{site.url}}{{site.baseurl}}/images/admin-ui-index/rollover2.png) + +1. Under **Configure source**, select the source alias on which you want to perform the rollover operation. + +1. If the alias does not contain a write index, you are prompted to assign a write index, as shown in the following image. + + ![Roll over]({{site.url}}{{site.baseurl}}/images/admin-ui-index/rollover4.png) + +1. Under **Configure a new rollover index** and on the **Define index** pane, specify an index name and an optional index alias. + +1. Under **Index settings**, specify the number of primary shards, the number of replicas, and the refresh interval, as shown in the following image. + + ![Roll over]({{site.url}}{{site.baseurl}}/images/admin-ui-index/rollover5.png) + +1. Choose **Roll over**. diff --git a/_dashboards/index.md b/_dashboards/index.md index df5a9516..8284d173 100644 --- a/_dashboards/index.md +++ b/_dashboards/index.md @@ -1,25 +1,71 @@ --- layout: default -title: About Dashboards +title: OpenSearch Dashboards nav_order: 1 has_children: false -has_toc: false +nav_exclude: true +permalink: /dashboards/ redirect_from: - - /docs/opensearch-dashboards/ - - /dashboards/ + - /dashboards/index/ --- -{%- comment -%}The `/docs/opensearch-dashboards/` redirect is specifically to support the UI links in OpenSearch Dashboards 1.0.0.{%- endcomment -%} - # OpenSearch Dashboards -OpenSearch Dashboards is the default visualization tool for data in OpenSearch. It also serves as a user interface for many of the OpenSearch plugins, including security, alerting, Index State Management, SQL, and more. 
+OpenSearch Dashboards is the user interface that lets you visualize your OpenSearch data and run and scale your OpenSearch clusters. + +## Getting started + +| Concept | Description | +|---------|-------------| +| [OpenSearch Dashboards Quickstart]({{site.url}}{{site.baseurl}}/dashboards/quickstart-dashboards/) | Learn about the basic concepts and features of OpenSearch Dashboards. | +| [OpenSearch Playground](https://playground.opensearch.org/app/home#/) | Explore features in OpenSearch Dashboards without downloading or installing anything. | +| [Install and configure OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/index/) | Get started with OpenSearch Dashboards. | +| [Create visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/) | Learn about visualizing data in OpenSearch Dashboards. | +| [Explore and query data]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/) | Learn how to explore and query data in OpenSearch. | + +## Query languages + +Query language | Where you can use it | Description +:--- | :--- | :--- +[Query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/index/) | [Dev Tools]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/index-dev/) | The primary OpenSearch query language that supports creating complex, fully customizable queries. +[Dashboards Query Language (DQL)]({{site.url}}{{site.baseurl}}/dashboards/discover/dql/) | [Discover]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/) and [Dashboard]({{site.url}}{{site.baseurl}}/dashboards/dashboard/index/) search bar | A simple text-based query language used to filter data in OpenSearch Dashboards. +[Query string query language]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) | [Discover]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/) and [Dashboard]({{site.url}}{{site.baseurl}}/dashboards/dashboard/index/) search bar | A scaled-down query language whose syntax is based on the Apache Lucene query syntax. +[SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/sql/index/) | [Query Workbench]({{site.url}}{{site.baseurl}}/dashboards/query-workbench/) | A traditional query language that bridges the gap between relational database concepts and the flexibility of OpenSearch’s document-oriented data storage. +[Piped Processing Language (PPL)]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/) | [Query Workbench]({{site.url}}{{site.baseurl}}/dashboards/query-workbench/) | The primary language used with observability in OpenSearch. PPL uses a pipe syntax that chains commands into a query. + +### Discover and Dashboard search bar + +Using the search bar in the [Discover]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/) and [Dashboard]({{site.url}}{{site.baseurl}}/dashboards/dashboard/index/) apps, you can search data with the following two languages: + +- [DQL]({{site.url}}{{site.baseurl}}/dashboards/discover/dql/) + +- [Query string query (Lucene)]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) + +The following table compares DQL and query string query language features. + +DQL and query string query language | DQL | Query string query language +:--- | :--- | :--- +- Wildcard expressions (DQL supports `*` only)
- Ranges <br> - Boolean operations <br> | - Querying nested fields | - Regular expressions <br> - Fuzziness <br> - Proximity queries <br>
- Boosting + +By default, the query language in the Discover search toolbar is DQL. To switch to query string syntax, select **DQL** and then turn off **OpenSearch Dashboards Query Language**. The query language changes to `Lucene`, as shown in the following image. + +![Using query string syntax in OpenSearch Dashboards Discover]({{site.url}}{{site.baseurl}}/images/discover-lucene-syntax.png) + +## Observability + +| Concept | Description | +|---------|-------------| +| [Observability in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}//observing-your-data/index/) | Observe, monitor, and secure data and improve performance across tools and workflows. | -## Get started with OpenSearch Dashboards +## Dashboards Management -1. After starting OpenSearch Dashboards, you can access it at port 5601. For example, http://localhost:5601. -1. Log in with the default username `admin` and password `admin`. -1. Choose **Try our sample data** and add the sample flight data. -1. Choose **Discover** and search for a few flights. -1. Choose **Dashboard**, **[Flights] Global Flight Dashboard**, and wait for the dashboard to load. +| Concept | Description | +|---------|-------------| +| [Dashboards Management]({{site.url}}{{site.baseurl}}/dashboards/management/management-index/) | Learn about the command center for customizing your OpenSearch Dashboards behavior, creating index patterns, and configuring data sources. | + +## Dev Tools + +| Concept | Description | +|---------|-------------| +| [Dev Tools]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/index-dev/) | Learn how to run OpenSearch queries in an integrated console. | diff --git a/_dashboards/install/index.md b/_dashboards/install/index.md deleted file mode 100644 index aa9a9461..00000000 --- a/_dashboards/install/index.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: default -title: Install OpenSearch Dashboards -nav_order: 1 -has_children: true -redirect_from: - - /dashboards/install/ ---- - -# Install and configure OpenSearch Dashboards - -OpenSearch Dashboards has three installation options at this time: Docker images, tarballs, and Helm charts. diff --git a/_dashboards/install/tls.md b/_dashboards/install/tls.md deleted file mode 100644 index 546dd36b..00000000 --- a/_dashboards/install/tls.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: default -title: Configure TLS -parent: Install OpenSearch Dashboards -nav_order: 40 ---- - -# Configure TLS for OpenSearch Dashboards - -By default, for ease of testing and getting started, OpenSearch Dashboards runs over HTTP. To enable TLS for HTTPS, update the following settings in `opensearch_dashboards.yml`. - -Setting | Description -:--- | :--- -opensearch.ssl.verificationMode | This setting is for communications between OpenSearch and OpenSearch Dashboards. Valid values are `full`, `certificate`, or `none`. We recommend `full` if you enable TLS, which enables hostname verification. `certificate` just checks the certificate, not the hostname, and `none` performs no checks (suitable for HTTP). Default is `full`. -opensearch.ssl.certificateAuthorities | If `opensearch.ssl.verificationMode` is `full` or `certificate`, specify the full path to one or more CA certificates that comprise a trusted chain for your OpenSearch cluster. For example, you might need to include a root CA _and_ an intermediate CA if you used the intermediate CA to issue your admin, client, and node certificates. -server.ssl.enabled | This setting is for communications between OpenSearch Dashboards and the web browser. 
Set to true for HTTPS, false for HTTP. -server.ssl.certificate | If `server.ssl.enabled` is true, specify the full path to a valid client certificate for your OpenSearch cluster. You can [generate your own]({{site.url}}{{site.baseurl}}/security-plugin/configuration/generate-certificates/) or get one from a certificate authority. -server.ssl.key | If `server.ssl.enabled` is true, specify the full path (e.g. `/usr/share/opensearch-dashboards-1.0.0/config/my-client-cert-key.pem` to the key for your client certificate. You can [generate your own]({{site.url}}{{site.baseurl}}/security-plugin/configuration/generate-certificates/) or get one from a certificate authority. -opensearch_security.cookie.secure | If you enable TLS for OpenSearch Dashboards, change this setting to `true`. For HTTP, set it to `false`. - -This `opensearch_dashboards.yml` configuration shows OpenSearch and OpenSearch Dashboards running on the same machine with the demo configuration: - -```yml -opensearch.hosts: ["https://localhost:9200"] -opensearch.ssl.verificationMode: full -opensearch.username: "kibanaserver" -opensearch.password: "kibanaserver" -opensearch.requestHeadersWhitelist: [ authorization,securitytenant ] -server.ssl.enabled: true -server.ssl.certificate: /usr/share/opensearch-dashboards/config/client-cert.pem -server.ssl.key: /usr/share/opensearch-dashboards/config/client-cert-key.pem -opensearch.ssl.certificateAuthorities: [ "/usr/share/opensearch-dashboards/config/root-ca.pem", "/usr/share/opensearch-dashboards/config/intermediate-ca.pem" ] -opensearch_security.multitenancy.enabled: true -opensearch_security.multitenancy.tenants.preferred: ["Private", "Global"] -opensearch_security.readonly_mode.roles: ["kibana_read_only"] -opensearch_security.cookie.secure: true -``` - -If you use the Docker install, you can pass a custom `opensearch_dashboards.yml` to the container. To learn more, see the [Docker installation page]({{site.url}}{{site.baseurl}}/opensearch/install/docker/). - -After enabling these settings and starting OpenSearch Dashboards, you can connect to it at `https://localhost:5601`. You might have to acknowledge a browser warning if your certificates are self-signed. To avoid this sort of warning (or outright browser incompatibility), best practice is to use certificates from trusted certificate authority. diff --git a/_dashboards/management/S3-data-source.md b/_dashboards/management/S3-data-source.md new file mode 100644 index 00000000..585edeac --- /dev/null +++ b/_dashboards/management/S3-data-source.md @@ -0,0 +1,60 @@ +--- +layout: default +title: Connecting Amazon S3 to OpenSearch +parent: Data sources +nav_order: 15 +has_children: true +--- + +# Connecting Amazon S3 to OpenSearch +Introduced 2.11 +{: .label .label-purple } + +Starting with OpenSearch 2.11, you can connect OpenSearch to your Amazon Simple Storage Service (Amazon S3) data source using the OpenSearch Dashboards UI. You can then query that data, optimize query performance, define tables, and integrate your S3 data within a single UI. + +## Prerequisites + +To connect data from Amazon S3 to OpenSearch using OpenSearch Dashboards, you must have: + +- Access to Amazon S3 and the [AWS Glue Data Catalog](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/connectors/s3glue_connector.rst#id2). +- Access to OpenSearch and OpenSearch Dashboards. +- An understanding of OpenSearch data source and connector concepts. 
See the [developer documentation](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/datasources.rst#introduction) for information about these concepts. + +## Connect your Amazon S3 data source + +To connect your Amazon S3 data source, follow these steps: + +1. From the OpenSearch Dashboards main menu, select **Management** > **Data sources**. +2. On the **Data sources** page, select **New data source** > **S3**. An example UI is shown in the following image. + + Amazon S3 data sources UI + +3. On the **Configure Amazon S3 data source** page, enter the required **Data source details**, **AWS Glue authentication details**, **AWS Glue index store details**, and **Query permissions**. An example UI is shown in the following image. + + Amazon S3 configuration UI + +4. Select the **Review Configuration** button and verify the details. +5. Select the **Connect to Amazon S3** button. + +## Manage your Amazon S3 data source + +Once you've connected your Amazon S3 data source, you can explore your data through the **Manage data sources** tab. The following steps guide you through using this functionality: + +1. On the **Manage data sources** tab, choose a data source from the list. +2. On that data source's page, you can manage the data source, choose a use case, and manage access controls and configurations. An example UI is shown in the following image. + + Manage data sources UI + +3. (Optional) Explore the Amazon S3 use cases, including querying your data and optimizing query performance. Go to **Next steps** to learn more about each use case. + +## Limitations + +This feature is still under development, including the data integration functionality. For real-time updates, see the [developer documentation on GitHub](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#limitations). + +## Next steps + +- Learn about [querying your data in Data Explorer]({{site.url}}{{site.baseurl}}/dashboards/management/query-data-source/) through OpenSearch Dashboards. +- Learn about ways to [optimize the query performance of your external data sources]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/), such as Amazon S3, through Query Workbench. +- Learn about [Amazon S3 and AWS Glue Data Catalog](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/admin/connectors/s3glue_connector.rst) and the APIs used with Amazon S3 data sources, including configuration settings and query examples. +- Learn about [managing your indexes]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/index/) through OpenSearch Dashboards. + diff --git a/_dashboards/management/accelerate-external-data.md b/_dashboards/management/accelerate-external-data.md new file mode 100644 index 00000000..00e4600f --- /dev/null +++ b/_dashboards/management/accelerate-external-data.md @@ -0,0 +1,66 @@ +--- +layout: default +title: Optimize query performance using OpenSearch indexing +parent: Connecting Amazon S3 to OpenSearch +grand_parent: Data sources +nav_order: 15 +has_children: false +--- + +# Optimize query performance using OpenSearch indexing +Introduced 2.11 +{: .label .label-purple } + + +Query performance can be slow when using external data sources for reasons such as network latency, data transformation, and data volume. You can optimize your query performance by using OpenSearch indexes, such as a skipping index or a covering index.
A _skipping index_ uses skip acceleration methods, such as partition, minimum and maximum values, and value sets, to ingest and create compact aggregate data structures. This makes them an economical option for direct querying scenarios. A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. See the [Flint Index Reference Manual](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md) for comprehensive guidance on this feature's indexing process. + +## Data sources use case: Accelerate performance + +To get started with the **Accelerate performance** use case available in **Data sources**, follow these steps: + +1. Go to **OpenSearch Dashboards** > **Query Workbench** and select your Amazon S3 data source from the **Data sources** dropdown menu in the upper-left corner. +2. From the left-side navigation menu, select a database. An example using the `http_logs` database is shown in the following image. + + Query Workbench accelerate data UI + +3. View the results in the table and confirm that you have the desired data. +4. Create an OpenSearch index by following these steps: + 1. Select the **Accelerate data** button. A pop-up window appears. An example is shown in the following image. + + Accelerate data pop-up window + + 2. Enter your details in **Select data fields**. In the **Database** field, select the desired acceleration index: **Skipping index** or **Covering index**. A _skipping index_ uses skip acceleration methods, such as partition, min/max, and value sets, to ingest data using compact aggregate data structures. This makes them an economical option for direct querying scenarios. A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. + +5. Under **Index settings**, enter the information for your acceleration index. For information about naming, select **Help**. Note that an Amazon S3 table can only have one skipping index at a time. An example is shown in the following image. + + Skipping index settings + +### Define skipping index settings + +1. Under **Skipping index definition**, select the **Add fields** button to define the skipping index acceleration method and choose the fields you want to add. An example is shown in the following image. + + Skipping index add fields + +2. Select the **Copy Query to Editor** button to apply your skipping index settings. +3. View the skipping index query details in the table pane and then select the **Run** button. Your index is added to the left-side navigation menu containing the list of your databases. An example is shown in the following image. + + Run a skippping or covering index UI + +### Define covering index settings + +1. Under **Index settings**, enter a valid index name. Note that each Amazon S3 table can have multiple covering indexes. An example is shown in the following image. + + Covering index settings + +2. Once you have added the index name, define the covering index fields by selecting `(add fields here)` under **Covering index definition**. An example is shown in the following image. + + Covering index field naming + +3. Select the **Copy Query to Editor** button to apply your covering index settings. +4. View the covering index query details in the table pane and then select the **Run** button. 
Your index is added to the left-side navigation menu containing the list of your databases. An example UI is shown in the following image. + + Run index in Query Workbench + +## Limitations + +This feature is still under development, so there are some limitations. For real-time updates, see the [developer documentation on GitHub](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#limitations). diff --git a/_dashboards/management/advanced-settings.md b/_dashboards/management/advanced-settings.md new file mode 100644 index 00000000..8602baea --- /dev/null +++ b/_dashboards/management/advanced-settings.md @@ -0,0 +1,23 @@ +--- +layout: default +title: Advanced settings +parent: Dashboards Management +nav_order: 40 +--- + +# Advanced settings +Updated 2.10 +{: .label .label-purple } + +Use the **Advanced settings** page to modify settings that govern OpenSearch Dashboards behavior. These settings can be used to customize the look and feel of the application, change the behavior of certain features, and more. A view of the interface is shown in the following image. + +Advanced settings interface + +To access **Advanced settings**, go to **Dashboards Management** and select **Advanced settings**. The page is divided into several sections, each containing a set of related settings. You can modify these settings by editing their fields. Once you've made the changes, select **Save** to apply them. + +{::nomarkdown}alert icon{:/} **Note**
Certain settings require you to modify [the `opensearch_dashboards.yml` file](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml) and restart OpenSearch Dashboards. +{: .note} + +## Required permissions + +To modify settings, you must have permission to make changes. See [Multi-tenancy configuration](https://opensearch.org/docs/latest/security/multi-tenancy/multi-tenancy-config/#give-roles-access-to-tenants) for guidance about assigning role access to tenants. diff --git a/_dashboards/management/data-sources.md b/_dashboards/management/data-sources.md new file mode 100644 index 00000000..615a6ae5 --- /dev/null +++ b/_dashboards/management/data-sources.md @@ -0,0 +1,73 @@ +--- +layout: default +title: Data sources +nav_order: 110 +has_children: true +--- + +# Data sources + +OpenSearch data sources are the applications that OpenSearch can connect to and ingest data from. Once your data sources have been connected and your data has been ingested, it can be indexed, searched, and analyzed using [REST APIs]({{site.url}}{{site.baseurl}}/api-reference/index/) or the OpenSearch Dashboards UI. + +This documentation focuses on using the OpenSearch Dashboards interface to connect and manage your data sources. For information about using an API to connect data sources, see the developer resources linked under [Next steps](#next-steps). + +## Prerequisites + +The first step in connecting your data sources to OpenSearch is to install OpenSearch and OpenSearch Dashboards on your system. You can follow the installation instructions in the [OpenSearch documentation]({{site.url}}{{site.baseurl}}/install-and-configure/index/) to install these tools. + +Once you have installed OpenSearch and OpenSearch Dashboards, you can use Dashboards to connect your data sources to OpenSearch and then manage those data sources, create index patterns based on them, run queries against a specific data source, and combine visualizations in one dashboard. + +Configuration of the [YAML files]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/#configuration-file) and installation of the `dashboards-observability` and `opensearch-sql` plugins are necessary. For more information, see [OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). + +## Create a data source connection + +A data source connection specifies the parameters needed to connect to a data source. These parameters form a connection string for the data source. Using Dashboards, you can add new data source connections or manage existing ones. + +The following steps guide you through the basics of creating a data source connection: + +1. From the OpenSearch Dashboards main menu, select **Dashboards Management** > **Data sources** > **Create data source connection**. The UI is shown in the following image. + + Connecting a data source UI + +2. Create the data source connection by entering the appropriate information into the **Connection Details** and **Authentication Method** fields. + + - Under **Connection Details**, enter a title and endpoint URL. For this tutorial, use the URL `http://localhost:5601/app/management/opensearch-dashboards/dataSources`. Entering a description is optional. + + - Under **Authentication Method**, select an authentication method from the dropdown list. Once an authentication method is selected, the applicable fields for that method appear. You can then enter the required details.
The authentication method options are: + - **No authentication**: No authentication is used to connect to the data source. + - **Username & Password**: A basic username and password are used to connect to the data source. + - **AWS SigV4**: An AWS Signature Version 4 authenticating request is used to connect to the data source. AWS Signature Version 4 requires an access key and a secret key. + - For AWS Signature Version 4 authentication, first specify the **Region**. Next, select the OpenSearch service in the **Service Name** list. The options are **Amazon OpenSearch Service** and **Amazon OpenSearch Serverless**. Lastly, enter the **Access Key** and **Secret Key** for authorization. + + After you have populated the required fields, the **Test connection** and **Create data source** buttons become active. You can select **Test connection** to confirm that the connection is valid. + +3. Select **Create data source** to save your settings. The connection is created. The active window returns to the **Data sources** main page, and the new connection appears in the list of data sources. + +4. To delete a data source connection, select the checkbox to the left of the data source **Title** and then select the **Delete 1 connection** button. Selecting multiple checkboxes for multiple connections is supported. An example UI is shown in the following image. + + Deleting a data source UI + +### Modify a data source connection + +To make changes to a data source connection, select a connection in the list on the **Data sources** main page. The **Connection Details** window opens. + +To make changes to **Connection Details**, edit one or both of the **Title** and **Description** fields and select **Save changes** in the lower-right corner of the screen. You can also cancel changes here. To change the **Authentication Method**, choose a different authentication method, enter your credentials (if applicable), and then select **Save changes** in the lower-right corner of the screen. The changes are saved. + +When **Username & Password** is the selected authentication method, you can update the password by choosing **Update stored password** next to the **Password** field. In the pop-up window, enter a new password in the first field and then enter it again in the second field to confirm. Select **Update stored password** in the pop-up window. The new password is saved. Select **Test connection** to confirm that the connection is valid. + +When **AWS SigV4** is the selected authentication method, you can update the credentials by selecting **Update stored AWS credential**. In the pop-up window, enter a new access key in the first field and a new secret key in the second field. Select **Update stored AWS credential** in the pop-up window. The new credentials are saved. Select **Test connection** in the upper-right corner of the screen to confirm that the connection is valid. + +To delete the data source connection, select the delete icon ({::nomarkdown}delete icon{:/}). + +## Create an index pattern + +Once you've created a data source connection, you can create an index pattern for the data source. An _index pattern_ is a template that OpenSearch uses to create indexes for data from the data source. See [Index patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/) for more information and a tutorial. + +## Next steps + +- Learn about [managing index patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/) through OpenSearch Dashboards. 
+- Learn about [indexing data using Index Management]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/index/) through OpenSearch Dashboards. +- Learn about how to connect [multiple data sources]({{site.url}}{{site.baseurl}}/dashboards/management/multi-data-sources/). +- Learn about how to [connect OpenSearch and Amazon S3 through OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/management/S3-data-source/). +- Learn about the [Integrations]({{site.url}}{{site.baseurl}}/integrations/index/) tool, which gives you the flexibility to use various data ingestion methods and connect data from the Dashboards UI. + diff --git a/_dashboards/management/index-patterns.md b/_dashboards/management/index-patterns.md new file mode 100644 index 00000000..590a9675 --- /dev/null +++ b/_dashboards/management/index-patterns.md @@ -0,0 +1,64 @@ +--- +layout: default +title: Index patterns +parent: Dashboards Management +nav_order: 10 +--- + +# Index patterns + +Index patterns are essential for accessing OpenSearch data. An _index pattern_ references one or more indexes, data streams, or index aliases. For example, an index pattern can point you to your log data from yesterday or all indexes that contain that data. + +If you store data in multiple indexes, creating an index pattern enables your visualizations to retrieve data from all indexes that match the index pattern. You need to create index patterns to define how data is retrieved and fields are formatted so that you can query, search, and display data. + +## Get started + +In this tutorial, you'll learn to create index patterns. + +{::nomarkdown}alert icon{:/}**Note**
+To create or modify index patterns, you must have create, manage, and delete permissions. Contact your administrator for support. For more information, refer to [Multi-tenancy configuration]({{site.url}}{{site.baseurl}}/security/multi-tenancy/multi-tenancy-config/#give-roles-access-to-tenants). +{: .note} + +## Prerequisites + +Before you can create an index pattern, your data must be indexed. To learn about indexing your data in OpenSearch, see [Managing indexes]({{site.url}}{{site.baseurl}}/im-plugin/index/). + +## Best practices + +Consider the following best practices when creating index patterns: + +- **Make your index patterns specific.** Instead of creating an index pattern that matches all indexes, create an index pattern that matches all indexes starting with a certain prefix, for example, `my-index-`. The more specific your index patterns, the better it will be to query and analyze your data. +- **Use wildcards sparingly.** Wildcards can be useful for matching multiple indexes, but they can also make it more difficult to manage your index patterns. Try to use wildcards as specifically as possible. +- **Test your index patterns.** Make sure to test your index patterns to ensure that they match the correct indexes. + +## Creating an index pattern + +If you added sample data, you have index patterns that you can use to analyze that data. To create an index pattern for your own data, follow these steps. + +### Step 1: Define the index pattern + +1. Go to OpenSearch Dashboards, and select **Management** > **Dashboards Management** > **Index patterns**. +2. Select **Create index pattern**. +3. From the **Create index pattern** window, define the index pattern by entering a name for your index pattern in the **Index pattern name** field. Dashboards automatically adds a wildcard, `*`, once you start typing. Using a wildcard is helpful for matching an index pattern to multiple sources or indexes. A dropdown list displaying all the indexes that match your index pattern appears when you start typing. +4. Select **Next step**. + +An example of step 1 is shown in the following image. Note that the index pattern `security*` matches three indexes. By defining the pattern with a wildcard `*`, you can query and visualize all the data in your indexes. + +Index pattern step 1 UI + +### Step 2: Configure the settings + +1. Select `@timestamp` from the dropdown menu to specify the time field for OpenSearch to use when filtering documents based on time. Selecting this time filter determines which field the time filter is applied to. It can be the timestamp of a request or any relevant timestamp field. If you don't want to use a time filter, select that option from the dropdown menu. If you select this option, OpenSearch returns all of the data in the indexes that match the pattern. + +2. Select **Create index pattern.** An example is shown in the following image. + + Index pattern step 2 UI + +Once the index pattern has been created, you can view the mapping of the matching indexes. Within the table, you can see the list of fields, along with their data type and properties. An example is shown in the following image. + +Index pattern table UI + +## Next steps + +- [Understand your data through visuals]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/). +- [Dig into your data]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/). 
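As a quick way to try the preceding steps, you can index a small time-series document so that a pattern such as `my-index-*` (the prefix example from the best practices above) has an index to match and a `@timestamp` field to use as the time field in step 2. The following is a minimal sketch; the index name and field values are hypothetical sample data.

```json
PUT my-index-2024.01.01/_doc/1
{
  "@timestamp": "2024-01-01T12:00:00Z",
  "message": "sample log line",
  "status": 200
}
```
{% include copy-curl.html %}

After this document is indexed, the `my-index-*` pattern should list the new index as a match, and `@timestamp` should appear as an available time field.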
diff --git a/_dashboards/management/management-index.md b/_dashboards/management/management-index.md new file mode 100644 index 00000000..7edc4d06 --- /dev/null +++ b/_dashboards/management/management-index.md @@ -0,0 +1,26 @@ +--- +layout: default +title: Dashboards Management +nav_order: 100 +has_children: true +--- + +# Dashboards Management +Introduced 2.10 +{: .label .label-purple } + +**Dashboards Management** serves as the command center for customizing OpenSearch Dashboards to your needs. A view of the interface is shown in the following image. + +Dashboards Management interface + +{::nomarkdown}alert icon{:/} **Note**
OpenSearch and OpenSearch Dashboards privileges govern access to individual features. If you do not have the appropriate access, consult your administrator. +{: .note} + +## Applications + +The following applications are available in **Dashboards Management**: + +- **[Index Patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/):** To access OpenSearch data, you need to create an index pattern so that you can select the data you want to use and define the properties of the fields. The Index Pattern tool gives you the ability to create an index pattern from within the UI. Index patterns point to one or more indexes, data streams, or index aliases. +- **[Data Sources]({{site.url}}{{site.baseurl}}/dashboards/management/multi-data-sources/):** The Data Sources tool is used to configure and manage the data sources that OpenSearch uses to collect and analyze data. You can use the tool to specify the source configuration in your copy of the [OpenSearch Dashboards configuration file]({{site.url}}{{site.baseurl}}https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml). +- **[Saved Objects](https://opensearch.org/blog/enhancement-multiple-data-source-import-saved-object/):** The Saved Objects tool helps you organize and manage your saved objects. Saved objects are files that store data, such as dashboards, visualizations, and maps, for later use. +- **[Advanced Settings]({{site.url}}{{site.baseurl}}/dashboards/management/advanced-settings/):** The Advanced Settings tool gives you the flexibility to personalize the behavior of OpenSearch Dashboards. The tool is divided into settings sections, such as General, Accessibility, and Notifications, and you can use it to customize and optimize many of your Dashboards settings. diff --git a/_dashboards/management/multi-data-sources.md b/_dashboards/management/multi-data-sources.md new file mode 100644 index 00000000..04473486 --- /dev/null +++ b/_dashboards/management/multi-data-sources.md @@ -0,0 +1,124 @@ +--- +layout: default +title: Configuring and using multiple data sources +parent: Data sources +nav_order: 10 +redirect_from: + - /dashboards/discover/multi-data-sources/ +--- + +# Configuring and using multiple data sources + +You can ingest, process, and analyze data from multiple data sources in OpenSearch Dashboards. You configure the data sources in the **Dashboards Management** > **Data sources** app, as shown in the following image. + + +Dashboards Management Data sources main screen + +## Getting started + +The following tutorial guides you through configuring and using multiple data sources. + +### Step 1: Modify the YAML file settings + +To use multiple data sources, you must enable the `data_source.enabled` setting. It is disabled by default. To enable multiple data sources: + +1. Open your local copy of the OpenSearch Dashboards configuration file, `opensearch_dashboards.yml`. If you don't have a copy, [`opensearch_dashboards.yml`](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml) is available on GitHub. +2. Set `data_source.enabled:` to `true` and save the YAML file. +3. Restart the OpenSearch Dashboards container. +4. Verify that the configuration settings were configured properly by connecting to OpenSearch Dashboards and viewing the **Dashboards Management** navigation menu. **Data sources** appears in the sidebar. You'll see a view similar to the following image. 
+ + Data sources in sidebar within Dashboards Management + +### Step 2: Create a new data source connection + +A data source connection specifies the parameters needed to connect to a data source. These parameters form a connection string for the data source. + +To create a new data source connection: + +1. From the OpenSearch Dashboards main menu, select **Dashboards Management** > **Data sources** > **Create data source connection**. +2. Add the required information to each field to configure **Connection Details** and **Authentication Method**. + + - Under **Connection Details**, enter a title and endpoint URL. For this tutorial, use the URL `http://localhost:5601/app/management/opensearch-dashboards/dataSources`. Entering a description is optional. + + - Under **Authentication Method**, select an authentication method from the dropdown list. Once an authentication method is selected, the applicable fields for that method appear. You can then enter the required details. The authentication method options are: + - **No authentication**: No authentication is used to connect to the data source. + - **Username & Password**: A basic username and password are used to connect to the data source. + - **AWS SigV4**: An AWS Signature Version 4 authenticating request is used to connect to the data source. AWS Signature Version 4 requires an access key and a secret key. + - For AWS Signature Version 4 authentication, first specify the **Region**. Next, select the OpenSearch service in the **Service Name** list. The options are **Amazon OpenSearch Service** and **Amazon OpenSearch Serverless**. Last, enter the **Access Key** and **Secret Key** for authorization. + + For information about available AWS Regions for AWS accounts, see [Available Regions](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html#concepts-available-regions). For more information about AWS Signature Version 4 authentication requests, see [Authenticating Requests (AWS Signature Version 4)](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-authenticating-requests.html). + {: .note} + + - After you have entered the appropriate details in all of the required fields, the **Test connection** and **Create data source** buttons become active. You can select **Test connection** to confirm that the connection is valid. + +3. Select **Create data source** to save your settings. The connection is created. The active window returns to the **Data Sources** main page, and the new connection appears in the list of data sources. + +4. Edit or update a data source connection. + + - To make changes to the data source connection, select a connection in the list on the **Data Sources** main page. The **Connection Details** window opens. + + - To make changes to **Connection Details**, edit one or both of the **Title** and **Description** fields and select **Save changes** in the lower-right corner of the screen. You can also cancel changes here. To change the **Authentication Method**, choose a different authentication method, enter your credentials (if applicable), and then select **Save changes** in the lower-right corner of the screen. The changes are saved. + + - When **Username & Password** is the selected authentication method, you can update the password by choosing **Update stored password** next to the **Password** field. In the pop-up window, enter a new password in the first field and then enter it again in the second field to confirm. Select **Update stored password** in the pop-up window. 
The new password is saved. Select **Test connection** to confirm that the connection is valid. + + - When **AWS SigV4** is the selected authentication method, you can update the credentials by selecting **Update stored AWS credential**. In the pop-up window, enter a new access key in the first field and a new secret key in the second field. Select **Update stored AWS credential** in the pop-up window. The new credentials are saved. Select **Test connection** in the upper-right corner of the screen to confirm that the connection is valid. + +5. Delete the data source connection by selecting the check box to the left of the title and then choosing **Delete 1 connection**. Selecting multiple check boxes for multiple connections is supported. Alternatively, select the trash can icon ({::nomarkdown}trash can icon{:/}). + +An example data source connection screen is shown in the following image. + +Data source connection screen + +### Selecting multiple data sources through the Dev Tools console + +Alternatively, you can select multiple data sources through the [Dev Tools]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/index-dev/) console. This option provides for working with a broader range of data and gaining deeper insight into your code and applications. + +Watch the following 10-second video to see it in action. + +Multiple data sources in Dev Tools demo{: .img-fluid} + +To select a data source through the Dev Tools console, follow these steps: + +1. Locate your copy of `opensearch_dashboards.yml` and open it in the editor of your choice. +2. Set `data_source.enabled` to `true`. +3. Connect to OpenSearch Dashboards and select **Dev Tools** in the menu. +4. Enter the following query in the editor pane of the **Console** and then select the play button: + + ```json + GET /_cat/indices + ``` + {% include copy-curl.html %} + +5. From the **Data source** dropdown menu, select a data source and then query the source. +6. Repeat the preceding steps for each data source you want to select. + +### Upload saved objects to a dashboard from connected data sources +To upload saved objects from connected data sources to a dashboard with multiple data sources, export them as an NDJSON file from the data source's **Saved object management** page. Then upload the file to the dashboard's **Saved object management** page. This method can make it easier to transfer saved objects between dashboards. The following 20-second video shows this feature in action. + + +Multiple data sources in Saved object management{: .img-fluid} + +Follow these steps to import saved objects from a connected data source: + +1. Locate your `opensearch_dashboards.yml` file and open it in your preferred text editor. +2. Set `data_source.enabled` to `true`. +3. Connect to OpenSearch Dashboards and go to **Dashboards Management** > **Saved objects**. +4. Select **Import** > **Select file** and upload the file acquired from the connected data source. +5. Choose the appropriate **Data source** from the dropdown menu, set your **Conflict management** option, and then select the **Import** button. + +## Next steps + +Once you've configured your multiple data sources, you can start exploring that data. See the following resources to learn more: + +- Learn about [managing index patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/) through OpenSearch Dashboards. +- Learn about [indexing data using Index Management]({{site.url}}{{site.baseurl}}/dashboards/im-dashboards/index/) through OpenSearch Dashboards. 
+- Learn about how to [connect OpenSearch and Amazon S3 through OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/management/S3-data-source/). +- Learn about the [Integrations tool]({{site.url}}{{site.baseurl}}/integrations/index/), which gives you the flexibility to use various data ingestion methods and connect data from the Dashboards UI. + +## Limitations + +This feature has some limitations: + +* The multiple data sources feature is supported for index-pattern-based visualizations only. +* The visualization types Time Series Visual Builder (TSVB), Vega and Vega-Lite, and timeline are not supported. +* External plugins, such as Gantt chart, and non-visualization plugins, such as the developer console, are not supported. diff --git a/_dashboards/management/query-data-source.md b/_dashboards/management/query-data-source.md new file mode 100644 index 00000000..f1496b3e --- /dev/null +++ b/_dashboards/management/query-data-source.md @@ -0,0 +1,66 @@ +--- +layout: default +title: Query and visualize Amazon S3 data +parent: Connecting Amazon S3 to OpenSearch +grand_parent: Data sources +nav_order: 10 +has_children: false +--- + +# Query and visualize Amazon S3 data +Introduced 2.11 +{: .label .label-purple } + +This tutorial guides you through using the **Query data** use case for querying and visualizing your Amazon Simple Storage Service (Amazon S3) data using OpenSearch Dashboards. + +## Prerequisites + +You must be using the `opensearch-security` plugin and have the appropriate role permissions. Contact your IT administrator to assign you the necessary permissions. + +## Get started with querying + +To get started, follow these steps: + +1. On the **Manage data sources** page, select your data source from the list. +2. On the data source's detail page, select the **Query data** card. This option takes you to the **Observability** > **Logs** page, which is shown in the following image. + + Observability Logs UI + +3. Select the **Event Explorer** button. This option creates and saves frequently searched queries and visualizations using [Piped Processing Language (PPL)]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/) or [SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/index/), which connects to Spark SQL. +4. Select the Amazon S3 data source from the dropdown menu in the upper-left corner. An example is shown in the following image. + + Observability Logs Amazon S3 dropdown menu + +5. Enter the query in the **Enter PPL query** field. Note that the default language is SQL. To change the language, select PPL from the dropdown menu. +6. Select the **Search** button. The **Query Processing** message is shown, confirming that your query is being processed. +7. View the results, which are listed in a table on the **Events** tab. On this page, details such as available fields, source, and time are shown in a table format. +8. (Optional) Create data visualizations. + +## Create visualizations of your Amazon S3 data + +To create visualizations, follow these steps: + +1. On the **Explorer** page, select the **Visualizations** tab. An example is shown in the following image. + + Explorer Amazon S3 visualizations UI + +2. Select **Index data to visualize**. This option currently only creates [acceleration indexes]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/), which give you views of the data visualizations from the **Visualizations** tab. To create a visualization of your Amazon S3 data, go to **Discover**. 
See the [Discover documentation]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/) for information and a tutorial. + +## Use Query Workbench with your Amazon S3 data source + +[Query Workbench]({{site.url}}{{site.baseurl}}/search-plugins/sql/workbench/) runs on-demand SQL queries, translates SQL into its REST equivalent, and views and saves results as text, JSON, JDBC, or CSV. + +To use Query Workbench with your Amazon S3 data, follow these steps: + +1. From the OpenSearch Dashboards main menu, select **OpenSearch Plugins** > **Query Workbench**. +2. From the **Data Sources** dropdown menu in the upper-left corner, choose your Amazon S3 data source. Your data begins loading the databases that are part of your data source. An example is shown in the following image. + + Query Workbench Amazon S3 data loading UI + +3. View the databases listed in the left-side navigation menu and select a database to view its details. Any information about acceleration indexes is listed under **Acceleration index destination**. +4. Choose the **Describe Index** button to learn more about how data is stored in that particular index. +5. Choose the **Drop index** button to delete and clear both the OpenSearch index and the Amazon S3 Spark job that refreshes the data. +6. Enter your SQL query and select **Run**. +## Next steps + +- Learn about [accelerating the query performance of your external data sources]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/). diff --git a/_dashboards/maptiles.md b/_dashboards/maptiles.md deleted file mode 100644 index f7a43046..00000000 --- a/_dashboards/maptiles.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -layout: default -title: WMS map server -nav_order: 5 -redirect_from: - - /docs/opensearch-dashboards/maptiles/ ---- - -{%- comment -%}The `/docs/opensearch-dashboards/maptiles/` redirect is specifically to support the UI links in OpenSearch Dashboards 1.0.0.{%- endcomment -%} - -# Configure WMS map server - -OpenSearch Dashboards includes default map tiles, but if you need more specialized maps, you can configure OpenSearch Dashboards to use a WMS map server: - -1. Open OpenSearch Dashboards at `https://:`. For example, [https://localhost:5601](https://localhost:5601). -1. If necessary, log in. -1. Choose **Management** and **Advanced Settings**. -1. Locate `visualization:tileMap:WMSdefaults`. -1. Change `enabled` to true and add the URL of a valid WMS map server: - - ```json - { - "enabled": true, - "url": "", - "options": { - "format": "image/png", - "transparent": true - } - } - ``` - -Map services often have licensing fees or restrictions. You're responsible for all such considerations on any map server that you specify. -{: .note } diff --git a/_dashboards/query-workbench.md b/_dashboards/query-workbench.md new file mode 100644 index 00000000..8fe41afc --- /dev/null +++ b/_dashboards/query-workbench.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Query Workbench +nav_order: 125 +redirect_from: + - /search-plugins/sql/workbench/ +--- + +# Query Workbench + +Query Workbench is a tool within OpenSearch Dashboards. You can use Query Workbench to run on-demand [SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/sql/index/) and [PPL]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/) queries, translate queries into their equivalent REST API calls, and view and save results in different [response formats]({{site.url}}{{site.baseurl}}/search-plugins/sql/response-formats/). 
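+
+The following example shows roughly what such a REST call looks like. This is a minimal sketch that assumes the SQL plugin's `_plugins/_sql` endpoint and the `accounts` sample index that you create later on this page; the exact request and response formats can vary by version:
+
+```json
+POST _plugins/_sql
+{
+  "query": "SELECT firstname, lastname FROM accounts WHERE balance > 10000"
+}
+```
+{% include copy-curl.html %}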
+ +A view of the Query Workbench interface within OpenSearch Dashboards is shown in the following image. + +Query Workbench interface within OpenSearch Dashboards + +## Prerequisites + +Before getting started, make sure you have [indexed your data]({{site.url}}{{site.baseurl}}/im-plugin/index/). + +For this tutorial, you can index the following sample documents. Alternatively, you can use the [OpenSearch Playground](https://playground.opensearch.org/app/opensearch-query-workbench#/), which has preloaded indexes that you can use to try out Query Workbench. + +To index sample documents, send the following [Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) request: + +```json +PUT accounts/_bulk?refresh +{"index":{"_id":"1"}} +{"account_number":1,"balance":39225,"firstname":"Amber","lastname":"Duke","age":32,"gender":"M","address":"880 Holmes Lane","employer":"Pyrami","email":"amberduke@pyrami.com","city":"Brogan","state":"IL"} +{"index":{"_id":"6"}} +{"account_number":6,"balance":5686,"firstname":"Hattie","lastname":"Bond","age":36,"gender":"M","address":"671 Bristol Street","employer":"Netagy","email":"hattiebond@netagy.com","city":"Dante","state":"TN"} +{"index":{"_id":"13"}} +{"account_number":13,"balance":32838,"firstname":"Nanette","lastname":"Bates","age":28,"gender":"F","address":"789 Madison Street","employer":"Quility","email":"nanettebates@quility.com","city":"Nogal","state":"VA"} +{"index":{"_id":"18"}} +{"account_number":18,"balance":4180,"firstname":"Dale","lastname":"Adams","age":33,"gender":"M","address":"467 Hutchinson Court","email":"daleadams@boink.com","city":"Orick","state":"MD"} +``` +{% include copy-curl.html %} + +## Running SQL queries within Query Workbench + +Follow these steps to learn how to run SQL queries against your OpenSearch data using Query Workbench: + +1. Access Query Workbench. + - To access Query Workbench, go to OpenSearch Dashboards and choose **OpenSearch Plugins** > **Query Workbench** from the main menu. + +2. Run a query. + - Select the **SQL** button. In the query editor, type a SQL expression and then select the **Run** button to run the query. + + The following example query retrieves the first name, last name, and balance from the `accounts` index for accounts with a balance greater than 10,000 and sorts by balance in descending order: + + ```sql + SELECT + firstname, + lastname, + balance + FROM + accounts + WHERE + balance > 10000 + ORDER BY + balance DESC; + ``` + {% include copy.html %} + +3. View the results. + - View the results in the **Results** pane, which presents the query output in tabular format. You can filter and download the results as needed. + + The following image shows the query editor pane and results pane for the preceding SQL query: + + Query Workbench SQL query input and results output panes + +4. Clear the query editor. + - Select the **Clear** button to clear the query editor and run a new query. + +5. Examine how the query is processed. + - Select the **Explain** button to examine how OpenSearch processes the query, including the steps involved and order of operations. + + The following image shows the explanation of the SQL query that was run in step 2. + + Query Workbench SQL query explanation pane + +## Running PPL queries within Query Workbench + +Follow these steps to learn how to run PPL queries against your OpenSearch data using Query Workbench: + +1. Access Query Workbench. 
+ - To access Query Workbench, go to OpenSearch Dashboards and choose **OpenSearch Plugins** > **Query Workbench** from the main menu. + +2. Run a query. + - Select the **PPL** button. In the query editor, type a PPL query and then select the **Run** button to run the query. + + The following is an example query that retrieves the `firstname` and `lastname` fields for documents in the `accounts` index with age greater than `18`: + + ```sql + search source=accounts + | where age > 18 + | fields firstname, lastname + ``` + {% include copy.html %} + +3. View the results. + - View the results in the **Results** pane, which presents the query output in tabular format. + + The following image shows the query editor pane and results pane for the PPL query that was run in step 2: + + Query Workbench PPL query input and results output panes + +4. Clear the query editor. + - Select the **Clear** button to clear the query editor and run a new query. + +5. Examine how the query is processed. + - Select the **Explain** button to examine how OpenSearch processes the query, including the steps involved and order of operations. + + The following image shows the explanation of the PPL query that was run in step 2. + + Query Workbench PPL query explanation pane + +Query Workbench does not support delete or update operations through SQL or PPL. Access to data is read-only. +{: .important} \ No newline at end of file diff --git a/_dashboards/quickstart.md b/_dashboards/quickstart.md new file mode 100644 index 00000000..eccdeb7d --- /dev/null +++ b/_dashboards/quickstart.md @@ -0,0 +1,118 @@ +--- +layout: default +title: OpenSearch Dashboards quickstart guide +nav_order: 2 +has_children: false +redirect_from: + - /dashboards/quickstart-dashboards/ +--- + +# OpenSearch Dashboards quickstart guide + +This quickstart guide provides tutorials on using OpenSearch Dashboards applications and tools. You can use these tutorials, either in your own environment or on [OpenSearch Playground](https://playground.opensearch.org/app/home#/), to learn the following fundamental concepts: + +- **Adding sample data:** Use preloaded visualizations, dashboards, and other tools to explore OpenSearch Dashboards before adding your own data. +- **Using the Discover application:** Analyze your data to gain insights. +- **Using the Dashboards application:** Create and store data visualizations. +- **Turning dark mode on or off:** Change the Dashboards theme. + +To dock or undock the navigation pane, select the {::nomarkdown}menu icon{:/} icon and then **Dock navigation** or **Undock navigation**. The OpenSearch Dashboards home page is shown in the following image. + +OpenSearch Dashboards home page + +{::nomarkdown}alert icon{:/} **Note**
Before you get started, make sure you've installed OpenSearch and OpenSearch Dashboards. For information about installation and configuration, see [Install and configure OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/) and [Install and configure OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/index/). +{: .note} + +## Adding sample data + +The following tutorials use the [**Sample flight data**](https://playground.opensearch.org/app/home#/tutorial_directory) dataset. +{: .note} + +To add sample data, follow these steps: + +1. On the OpenSearch Dashboards **Home** page, choose **Add sample data**. Alternatively, choose **Add data** on the upper-right toolbar. +2. On the **Add sample data** page, choose the dataset(s) you want to add to Dashboards. The following image shows the available sample datasets. + +Adding sample data window + +## Using the Discover application + +With [**Discover**]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/), you can: + +- Choose data for analysis, set a time range for that data, search it using [Dashboards Query Language (DQL)]({{site.url}}{{site.baseurl}}/dashboards/dql/), and filter the results. +- Analyze your data by querying and filtering, viewing results in a table, and examining documents. +- Create histograms to display the distribution of your data. + +Follow these steps to use the Discover tool: + +1. From the OpenSearch Dashboards navigation menu, choose **Discover**. +2. On the **Discover** page, choose the index pattern `opensearch_dashboards_sample_data_flights` from the dropdown menu on the upper left. +3. Select the {::nomarkdown}calendar icon{:/} icon to change the [time filter]({{site.url}}{{site.baseurl}}/dashboards/discover/time-filter/) from the default of **Last 15 minutes** to **Last 7 days**. +4. In the DQL search bar, enter `FlightDelay:true AND DestCountry: US AND FlightDelayMin >= 60` and select **Update**. Results are shown for US-bound flights delayed by 60 minutes or more. +5. Filter data by selecting **Add filter** from the DQL search bar and then selecting a **Field**, **Operator**, and **Value** from the dropdown lists in the **Edit Filter** pop-up window. For example, select `FlightDelayType`, **is**, and **Weather Delay**. + +The resulting view is shown in the following image. + +Discover output of steps 1 through 6 + +## Using the Dashboards application + +With **Dashboards**, you can: + +- Display data in a single view. +- Build dynamic dashboards. +- Create and share reports. +- Embed analytics to differentiate your applications. + +The **Dashboards** application creates and stores visualizations generated from your data. Follow these steps to use the application: + +1. On the OpenSearch Dashboards **Home** page, choose **Dashboards**. A list of dashboards generated from the sample data appears. +2. In the search toolbar, search for and select **[Flights] Global Flight Dashboard**. You'll see a dashboard preloaded with visualizations, including charts, maps, and data tables. +3. To add other panels to the dashboard, select the **Edit** button and then choose **Add** from the toolbar. The **Add panels** window opens. +4. In the search toolbar in the **Add panels** window, search for and select the existing panel **[Flights] Delay Buckets**. A pop-up message confirms that you've added the panel. +5. Select close `x` to exit the **Add panels** window. +6. The newly added panel is now displayed on the dashboard. 
+ +The resulting view is shown in the following image. + +Add panel tutorial screen view + +For information about using a specific data visualization type, such as VisBuilder, go to [Building data visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/). For information about using dashboards and visualizations in **Observability**, go to [Observability]({{site.url}}{{site.baseurl}}/observing-your-data/). +{: .note} + +### Interacting with data using dashboards + +Interactive dashboards allow you to analyze data in more depth and filter it in several ways. With **Dashboards**, you can use dashboard-level filters to directly interact with data. + +Using the **[Flights] Global Flight Dashboard** dashboard, follow these steps to further analyze and filter the sample flight data: + +1. On the **[Flights] Airline Carrier** panel, choose **OpenSearch-Air**. The dashboard updates automatically. +2. Choose **Save** to save the dashboard. + +Alternatively, you can use the dashboard toolbar to apply filters by following these steps: + +1. In the dashboard toolbar, choose **Add filter**. +2. From the **Field**, **Operator**, and **Value** dropdown lists, choose **Carrier**, **is**, and **OpenSearch-Air**, respectively, as shown in the following image. +3. Choose **Save**. The dashboard updates automatically. + +The resulting view is shown in the following image. + +Screenshot of Dashboard tutorial panel view + +## Turning dark mode on or off + +Changing the Dashboards theme requires admin permissions. If you are an admin, follow these steps: + +1. Navigate to **Management** > **Dashboards Management** > **Advanced Settings**. +2. Scroll down to the **Appearance** section and locate the **Dark mode** option. +3. Use the toggle switch to turn dark mode on or off for all users of your Dashboards instance, as shown in the image following these steps. +4. Select the **Save changes** button and then the **Reload** button. The updated theme is applied immediately. + +Dark mode view + +## Next steps + +- Go to [Building data visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/) to learn more about Dashboards data visualizations. +- Go to [Creating dashboards]({{site.url}}{{site.baseurl}}/dashboards/quickstart-dashboards/) to learn more about creating dashboards. +- Go to [Analyzing data]({{site.url}}{{site.baseurl}}/dashboards/discover/index-discover/) to learn more about using Dashboards to analyze data. +- Go to [Ingest APIs]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) and [Ingest pipelines]({{site.url}}{{site.baseurl}}/ingest-pipelines/) to learn more about using OpenSearch for data ingestion. diff --git a/_dashboards/reporting.md b/_dashboards/reporting.md deleted file mode 100644 index 4a2247fa..00000000 --- a/_dashboards/reporting.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -layout: default -title: Reporting -nav_order: 20 ---- - - -# Reporting - -You can use OpenSearch Dashboards to create PNG, PDF, and CSV reports. To create reports, you must have the correct permissions. For a summary of the predefined roles and the permissions they grant, see the [security plugin]({{site.url}}{{site.baseurl}}/security-plugin/access-control/users-roles#predefined-roles). - -CSV reports have a non-configurable 10,000 row limit. They have no explicit size limit (e.g. in MB), but extremely large documents could cause report generation to fail with an out of memory error from the V8 JavaScript engine. 
-{: .tip } - - -## Create reports from Discovery, Visualize, or Dashboard - -Quickly generate an on-demand report from the current view. - -1. From the top menu bar, choose **Reporting**. -1. For dashboards or visualizations, choose **Download PDF** or **Download PNG**. From the Discover page, choose **Download CSV**. - - Reports generate asynchronously in the background and might take a few minutes, depending on the size of the report. A notification appears when your report is ready to download. - -1. To create a schedule-based report, choose **Create report definition**. Then proceed to [Create reports using a definition](#create-reports-using-a-definition). This option pre-fills many of the fields for you based on the visualization, dashboard, or data you were viewing. - - -## Create reports using a definition - -Definitions let you generate reports on a periodic schedule. - -1. From the navigation panel, choose **Reporting**. -1. Choose **Create**. -1. Under **Report settings**, enter a name and optional description for your report. -1. Choose the **Report Source** (i.e. the page from which the report is generated). You can generate reports from the **Dashboard**, **Visualize**, or **Discover** pages. -1. Select your dashboard, visualization, or saved search. Then choose a time range for the report. -1. Choose an appropriate file format for the report. -1. (Optional) Add a header or footer to the report. Headers and footers are only available for dashboard or visualization reports. -1. Under **Report trigger**, choose either **On-demand** or **Schedule**. - - For scheduled reports, select either **Recurring** or **Cron based**. You can receive reports daily or at some other time interval. Cron expressions give you even more flexiblity. See [Cron expression reference]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/cron/) for more information. - -1. Choose **Create**. - -## Troubleshooting - -### Chromium fails to launch with OpenSearch Dashboards - -While creating a report for dashboards or visualizations, you might see a the following error: - -![OpenSearch Dashboards reporting pop-up error message]({{site.url}}{{site.baseurl}}/images/reporting-error.png) - -This problem can occur for two reasons: - -- You don't have the correct version of `headless-chrome` to match the operating system on which OpenSearch Dashboards is running. Download the correct version [here](https://github.com/opensearch-project/dashboards-reports/releases/tag/chromium-1.12.0.0). - -- You're missing additional dependencies. Install the required dependencies for your operating system from the [additional libraries](https://github.com/opensearch-project/dashboards-reports/blob/main/dashboards-reports/rendering-engine/headless-chrome/README.md#additional-libaries) section. diff --git a/_dashboards/search-telemetry.md b/_dashboards/search-telemetry.md new file mode 100644 index 00000000..3b9af82b --- /dev/null +++ b/_dashboards/search-telemetry.md @@ -0,0 +1,44 @@ +--- +layout: default +title: Search telemetry +nav_order: 140 +--- + + +# Search telemetry + +You can use search telemetry to analyze search request performance by success or failure in OpenSearch Dashboards. OpenSearch stores telemetry data in the `.kibana_1` index. + +Because there are thousands of concurrent search requests from OpenSearch Dashboards, the heavy traffic can cause significant load in an OpenSearch cluster. + +OpenSearch clusters perform better with search telemetry turned off. 
+{: .tip } + +## Turning on search telemetry + +Search usage telemetry is turned off by default. To turn it on, you need to set `data.search.usageTelemetry.enabled` to `true` in the `opensearch_dashboards.yml` file. + +You can find the [OpenSearch Dashboards YAML file](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml) in the opensearch-project repository on GitHub. + +Turning on telemetry in the `opensearch_dashboards.yml` file overrides the default search telemetry setting of `false` in the [Data plugin configuration file](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/src/plugins/data/config.ts). +{: .note } + +### Turning search telemetry on or off + +The following table shows the `data.search.usageTelemetry.enabled` values you can set in `opensearch_dashboards.yml` to turn search telemetry on or off. + +OpenSearch Dashboards YAML value | Search telemetry status: on or off +:--- | :--- + `true` | On + `false` | Off + `none` | Off + +#### Sample opensearch_dashboards.yml with telemetry enabled + + This OpenSearch Dashboards YAML file excerpt shows the telemetry setting set to `true` to turn on search telemetry: + + ```json +# Set the value of this setting to false to suppress +# search usage telemetry to reduce the load of the OpenSearch cluster. + data.search.usageTelemetry.enabled: true +``` \ No newline at end of file diff --git a/_dashboards/sm-dashboards.md b/_dashboards/sm-dashboards.md new file mode 100644 index 00000000..3f6cd11d --- /dev/null +++ b/_dashboards/sm-dashboards.md @@ -0,0 +1,180 @@ +--- +layout: default +title: Snapshot Management +nav_order: 90 +redirect_from: + - /dashboards/admin-ui-index/sm-dashboards/ +--- + +# Snapshot Management + +[Snapshots]({{site.url}}{{site.baseurl}}/opensearch/snapshots/index/) are backups of a cluster’s indexes and state. The state includes cluster settings, node information, index metadata (mappings, settings, templates), and shard allocation. The Snapshot Management (SM) interface in OpenSearch Dashboards provides a unified solution for taking and restoring snapshots. + +An example of the interface is shown in the following image. + +![Snapshot Management user interface]({{site.url}}{{site.baseurl}}/images/dashboards/snapshots-UI.png) + +## Snapshots use cases + +Snapshots have two main uses: + +1. Recovering from failure + + For example, if cluster health goes red, you might restore the red indexes from a snapshot. + +2. Migrating from one cluster to another + + For example, if you’re moving from a proof of concept to a production cluster, you might take a snapshot of the former and restore it on the latter. + +## Creating a repository + +Before you create an SM policy, set up a repository for snapshots. + +1. From the OpenSearch Dashboards main menu, select **Management** > **Snapshot Management**. +2. In the left panel, under **Snapshot Management**, select **Repositories**. +3. Choose the **Create Repository** button. +4. Enter the repository name, type, and location. +5. (Optional) Select **Advanced Settings** and enter additional settings for this repository as a JSON object. +#### Example +```json + { + "chunk_size": null, + "compress": false, + "max_restore_bytes_per_sec": "40m", + "max_snapshot_bytes_per_sec": "40m", + "readonly": false + } +``` +6. Choose the **Add** button. + +{::nomarkdown}star icon{:/} **Note:** If you need to automate snapshot creation, you can use a snapshot policy. 
+{: .note purple} + +## Deleting a repository + +To delete a snapshot repository configuration, select the repository from the **Repositories** list and then choose the **Delete** button. + +## Creating an SM policy + +Create an SM policy to set up automatic snapshots. An SM policy defines an automated snapshot creation schedule and an optional automated deletion schedule. + +1. From the OpenSearch Dashboards main menu, select **Management** > **Snapshot Management**. +1. In the left panel, under **Snapshot Management**, select **Snapshot Policies**. +1. Select the **Create Policy** button. +1. In the **Policy settings** section: + 1. Enter the policy name. + 1. (Optional) Enter the policy description. +1. In the **Source and destination** section: + 1. Select or enter source indexes either as a list or as an index pattern. + 1. Select a repository for snapshots. To [create a new repository](#creating-a-repository), select the **Create** button. +1. In the **Snapshot schedule** section: + 1. Select the desired snapshot frequency or enter a custom cron expression for snapshot frequency. + 1. Select the start time and time zone. +1. In the **Retention period** section: + 1. Choose to retain all snapshots or specify retention conditions (the maximum age of retained snapshots). + 1. (Optional) In **Additional settings**, select the minimum and maximum number of retained snapshots, deletion frequency, and deletion start time. +1. In the **Notifications** section, select the snapshot activities you want to be notified about. +1. (Optional) In the **Advanced settings** section, select the desired options: + - **Include cluster state in snapshots** + - **Ignore unavailable indices** + - **Allow partial snapshots** +1. Select the **Create** button. + +## View, edit, or delete an SM policy + +You can view, edit, or delete an SM policy on the policy details page. + +1. From the OpenSearch Dashboards main menu, select **Management** > **Snapshot Management**. +1. In the left panel, under **Snapshot Management**, select **Snapshot Policies**. +1. Click on the **Policy name** of the policy you want to view, edit, or delete.
+The policy details page displays the policy settings, snapshot schedule, snapshot retention period, notifications, and the status of the last snapshot creation and deletion.<br>
If a snapshot creation or deletion fails, you can view information about the failure in the **Last Creation/Deletion** section. To view the failure message, click on the **cause** in the **Info** column. +1. To edit or delete the SM policy, select the **Edit** or **Delete** button. + +## Enable, disable, or delete SM policies + +1. From the OpenSearch Dashboards main menu, select **Management** > **Snapshot Management**. +1. In the left panel, under **Snapshot Management**, select **Snapshot Policies**. +1. Select one or more policies in the list. +1. To enable or disable selected SM policies, select the **Enable** or **Disable** button. To delete selected SM policies, in the **Actions** list, select the **Delete** option. + +## View snapshots + +1. From the OpenSearch Dashboards main menu, select **Management** > **Snapshot Management**. +1. In the left panel, under **Snapshot Management**, select **Snapshots**. +All automatically or manually taken snapshots appear in the list. +1. To view a snapshot, click on its **Name**. + +## Take a snapshot + +Follow these steps to take a snapshot manually: + +1. From the OpenSearch Dashboards main menu, select **Management** > **Snapshot Management**. +1. In the left panel, under **Snapshot Management**, select **Snapshots**. +1. Select the **Take snapshot** button. +1. Enter the snapshot name. +1. Select or enter source indexes either as a list or as an index pattern. +1. Select a repository for the snapshot. +1. (Optional) In the **Advanced options** section, select the desired options: + - **Include cluster state in snapshots** + - **Ignore unavailable indices** + - **Allow partial snapshots** +1. Choose the **Add** button. + +## Deleting a snapshot + +The **Delete** button [deletes]({{site.url}}{{site.baseurl}}/api-reference/snapshots/delete-snapshot/) a snapshot from a repository. + +1. To view a list of your repositories, choose **Repositories** under the **Snapshot Management** section. +2. To view a list of your snapshots, choose **Snapshots** under the **Snapshot Management** section. + +## Restoring a snapshot + +1. From the OpenSearch Dashboards main menu, select **Management** > **Snapshot Management**. +1. In the left panel, under **Snapshot Management**, select **Snapshots**. The **Snapshots** tab is selected by default. +1. Select the checkbox next to the snapshot you want to restore. An example is shown in the following image: + Snapshots{: .img-fluid} + + {::nomarkdown}star icon{:/} **Note:** You can only restore snapshots with the status of `Success` or `Partial`. The status of the snapshot is displayed in the **Snapshot status** column. + {: .note purple} +1. In the **Restore snapshot** flyout, select the options for restoring the snapshot. + + The **Restore snapshot** flyout lists the snapshot name and status. To view the list of indexes in the snapshot, select the number under **Indices** (for example, `27` in the following image). This number represents the number of indexes in the snapshot. + + Restore Snapshot + + For more information about the options in the **Restore snapshot** flyout, see [Restore snapshots]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#restore-snapshots). + + **Ignoring missing indexes** + + If you specify which indexes you want to restore from the snapshot and select the **Ignore unavailable indices** option, the restore operation ignores the indexes that are missing from the snapshot. 
For example, if you want to restore the `log1` and `log2` indexes, but `log2` is not in the snapshot, `log1` is restored and `log2` is ignored. If you don't select **Ignore unavailable indices**, the entire restore operation fails if an index to be restored is missing from a snapshot. + + **Custom index settings** + + You can choose to customize some settings for the indexes restored from a snapshot:
+  • Select the **Customize index settings** checkbox to provide new values for the specified index settings. All newly restored indexes will use these values instead of the ones in the snapshot.
+  • Select the **Ignore index settings** checkbox to specify the settings in the snapshot to ignore. All newly restored indexes will use the cluster defaults for these settings. + + The examples in the following image set `index.number_of_replicas` to `0`, `index.auto_expand_replicas` to `true`, and `index.refresh_interval` and `index.max_script_fields` to the cluster default values for all newly restored indexes. + + Custom settings + + For more information about index settings, see [Index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/). + + For a list of settings that you cannot change or ignore, see [Restore snapshots]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#restore-snapshots). + + After choosing the options, select the **Restore snapshot** button. +1. (Optional) To monitor the restore progress, select **View restore activities** in the confirmation dialog. You can also monitor the restore progress at any time by selecting the **Restore activities in progress** tab, as shown in the following image. + + Restore Activities{: .img-fluid} + + You can view the percentage of the job that has been completed in the **Status** column. Once the snapshot restore is complete, the **Status** changes to `Completed (100%)`. + + {::nomarkdown}star icon{:/} **Note:** The **Restore activities in progress** panel is not persistent. It displays only the progress of the current restore operation. If multiple restore operations are running, the panel displays the most recent one. + {: .note purple} + To view the status of each index being restored, select the link in the **Indices being restored** column (in the preceding image, the `27 Indices` link). The **Indices being restored** flyout (shown in the following image) displays each index and its restore status. + + Restore Indices{: .img-fluid} + + After the restore operation is complete, the restored indexes are listed in the **Indices** panel. To view the indexes, in the left panel, under **Index Management**, choose **Indices**. + +View Indices{: .img-fluid} diff --git a/_dashboards/visualize/area.md b/_dashboards/visualize/area.md new file mode 100644 index 00000000..0f3b7863 --- /dev/null +++ b/_dashboards/visualize/area.md @@ -0,0 +1,60 @@ +--- +layout: default +title: Using area charts +parent: Building data visualizations +nav_order: 5 +--- + +# Using area charts + +An area chart is a line chart with the area between the line and the axis shaded with a color, and is a primary visualization type used to display time series data. You can create area charts in Dashboards using the Area visualization type or using the Time Series Visual Builder (TSVB), Vega, or VisBuilder visualization tools. For this tutorial, you'll use the Area visualization type. + +![Demonstration of the area chart tutorial steps]({{site.url}}{{site.baseurl}}/images/dashboards/area-tutorial.gif) + +# Try it: Create a simple aggregation-based area chart + +In this tutorial you'll create a simple area chart using sample data and aggregations in OpenSearch Dashboards by connecting to [http://localhost:5601](http://localhost:5601) from a browser. + +You have several aggregation options in Dashboards, and the choice influences your analysis. The use cases for aggregations vary from analyzing data in real time to using Dashboards to create a visualization dashboard. If you need an overview of aggregations in OpenSearch, see [Aggregations]({{site.url}}{{site.baseurl}}/opensearch/aggregations/) before starting this tutorial. 
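+
+To see what an aggregation like the ones in this tutorial looks like as a query, you can run a request similar to the following in Dev Tools. This is only an illustrative sketch: it assumes the `opensearch_dashboards_sample_data_flights` sample index used in this tutorial and computes the average flight delay in three-hour buckets, similar to the aggregations that the area chart uses:
+
+```json
+GET opensearch_dashboards_sample_data_flights/_search
+{
+  "size": 0,
+  "aggs": {
+    "delays_over_time": {
+      "date_histogram": {
+        "field": "timestamp",
+        "fixed_interval": "3h"
+      },
+      "aggs": {
+        "avg_delay": {
+          "avg": { "field": "FlightDelayMin" }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}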
+ +Make sure you have [installed the latest version of Dashboards](https://opensearch.org/docs/latest/install-and-configure/install-dashboards/index/) and added the sample data before continuing with this tutorial. _This tutorial uses Dashboards version 2.4.1_. +{: .note} + +## Set up the area chart + +1. Access Dashboards by connecting to [http://localhost:5601](http://localhost:5601) from a browser. +1. Select **Visualize** from the menu and then select **Create visualization**. +1. Select **Area** from the window. +1. Select **opensearch_dashboards_sample_data_flights** in the **New Area/Choose a source** window. +1. Select the calendar icon and set the time filter to **Last 7 days**. +1. Select **Update**. + +## Add aggregations to the area chart + +Continuing with the area chart created in the preceding steps, you'll create a visualization that displays the top five logs for flights delayed for every three hours over the last seven days: + +1. Add a **Metrics** aggregation. + 1. Under **Metrics**, select the **Aggregation** dropdown list and choose **Average** and then select the **Field** dropdown list and choose **FlightDelayMin**. + 1. Under **Metrics**, select **Add** to add another Y-axis aggregation. + 1. Select the **Aggregation** dropdown list and choose **Max** and then select the **Field** dropdown list and choose **FlightDelayMin**. +1. Add a **Buckets** aggregation. + 1. Select **Add** to open the **Add Bucket** window and then select **X-axis**. + 2. From the **Aggregation** dropdown list, select **Date Histogram**. + 3. From the **Field** dropdown list, select **timestamp**. + 4. Select **Update**. +2. Add a sub-aggregation. + 1. Select **Add** to open the **Add Sub-Buckets** window and then select **Split series**. + 2. From the **Sub aggregation** dropdown list, select **Terms**. + 3. From the **Field** dropdown list, select **FlightDelay**. + 4. Select **Update** to reflect these parameters in the graph. + +You've now created the following aggregation-based area chart. + +![Resulting aggregation-based area chart]({{site.url}}{{site.baseurl}}/images/area-aggregation-tutorial.png) + +# Related links + +- [Visualize]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/) +- [Visualization types in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/) +- [Install and configure OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/index/) +- [Aggregations]({{site.url}}{{site.baseurl}}/opensearch/aggregations/) \ No newline at end of file diff --git a/_dashboards/gantt.md b/_dashboards/visualize/gantt.md similarity index 79% rename from _dashboards/gantt.md rename to _dashboards/visualize/gantt.md index 8eb33c11..875e35c1 100644 --- a/_dashboards/gantt.md +++ b/_dashboards/visualize/gantt.md @@ -1,7 +1,10 @@ --- layout: default -title: Gantt charts -nav_order: 10 +title: Using Gantt charts +parent: Building data visualizations +nav_order: 30 +redirect_from: + - /dashboards/gantt/ --- # Gantt charts @@ -15,11 +18,11 @@ To create a Gantt chart, perform the following steps: 1. In the visualizations menu, choose **Create visualization** and **Gantt Chart**. 1. Choose a source for the chart (e.g. some log data). 1. Under **Metrics**, choose **Event**. For log data, each log is an event. -1. Select the **Start Time** and **Duration** fields from your data set. The start time is the timestamp for the begining of an event. The duration is the amount of time to add to the start time. +1. 
Select the **Start Time** and **Duration** fields from your data set. The start time is the timestamp for the beginning of an event. The duration is the amount of time to add to the start time. 1. Under **Results**, choose the number of events to display on the chart. Gantt charts sequence events from earliest to latest based on start time. 1. Choose **Panel settings** to adjust axis labels, time format, and colors. 1. Choose **Update**. -![Gantt Chart]({{site.url}}{{site.baseurl}}/images/gantt-chart.png) +![Gantt Chart]({{site.url}}{{site.baseurl}}/images/dashboards/gantt-chart.png) This Gantt chart displays the ID of each log on the y-axis. Each bar is a unique event that spans some amount of time. Hover over a bar to see the duration of that event. diff --git a/_dashboards/visualize/geojson-regionmaps.md b/_dashboards/visualize/geojson-regionmaps.md new file mode 100644 index 00000000..663c4c2f --- /dev/null +++ b/_dashboards/visualize/geojson-regionmaps.md @@ -0,0 +1,79 @@ +--- +layout: default +title: Using coordinate and region maps +parent: Building data visualizations +has_children: true +nav_order: 15 +redirect_from: + - /dashboards/geojson-regionmaps/ +--- + +# Using coordinate and region maps + +OpenSearch has a standard set of GeoJSON files that provide a vector map with each region map. OpenSearch Dashboards also provides basic map tiles with a standard vector map to create region maps. You can configure the base map tiles using [Web Map Service (WMS)](https://www.ogc.org/standards/wms). For more information, see [Configuring WMS in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/maptiles/). + +For air gapped environments, OpenSearch Dashboards provides a self-host maps server. For more information, see [Using the self-host maps server]({{site.url}}{{site.baseurl}}/dashboards/selfhost-maps-server/) + +While you can't configure a server to support user-defined vector map layers, you can configure your own GeoJSON file and upload it for this purpose. +{: .note} + +## Customizing vector maps with GeoJSON + +If you have a specific locale that is not provided by OpenSearch Dashboards vector maps, such as a US county or US ZIP Code, you can create your own custom vector map with a GeoJSON file. To create a custom region map you would define a geographic shape such as a polygon with multiple coordinates. To learn more about the various geographic shapes that support a custom region map location, see [Geoshape field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-shape/). + +GeoJSON format allows you to encode geographic data structures. To learn more about the GeoJSON specification, go to [geojson.org](https://geojson.org/). + +You can use [geojson.io](https://geojson.io/#map=2/20.0/0.0) to extract GeoJSON files. + +> **PREREQUISITE** +> To use a custom vector map with GeoJSON, install these two required plugins: +> * OpenSearch Dashboards Maps [`dashboards-maps`](https://github.com/opensearch-project/dashboards-maps) front-end plugin +> * OpenSearch [`geospatial`](https://github.com/opensearch-project/geospatial) backend plugin +{: .note} + +### Step 1: Creating a region map visualization + +To create your own custom vector map, upload a JSON file that contains GEO data for your customized regional maps. The JSON file contains vector layers for visualization. + +1. Prepare a JSON file to upload. Make sure the file has either a .geojson or .json extension. +1. On the top menu bar, go to **OpenSearch Dashboards > Visualize**. +1. 
Select the **Create Visualization** button. +1. Select **Region Map**. +1. Choose a source. For example, **[Flights] Flight Log**. +1. In the right panel, select **Import Vector Map**. +1. In **Upload map**, select or drag and drop your JSON file and then enter **Map name prefix** (for example, `usa-counties`). Your map will have the prefix that you defined followed by the `-map` suffix (for example, `usa-counties-map`), as shown in the following image: + + Importing a GeoJSON file + +1. Select the **Import file** button and then select the **Refresh** button in the pop-up window confirming successful upload, as shown in the following image. + + Message confirming successful file upload + +### Step 2: Viewing the custom region map in OpenSearch Dashboards + +After you upload a custom GeoJSON file, you need to set the vector map layer to custom, and select your vector map: + +1. From **Layer Options > Layer settings**, select **Custom vector map**. +1. Under **Vector map**, select the name of the vector map that you just uploaded. +1. Optional: Under **Style settings**, increase **Border thickness** to see the borders more clearly. +1. Select the **Update** button. +1. View your region map in the Dashboards. For example, the following image shows the Los Angeles and San Diego county regions: + + Custom GeoJSON region map + +#### Example GeoJSON file + +The following example GeoJSON file provides coordinates for two US counties. + +```json +{ + "type": "FeatureCollection", + "name": "usa counties", + "features": [ + { "type": "Feature", "properties": { "iso2": "US", "iso3": "LA-CA", "name": "Los Angeles County", "country": "US", "county": "LA" }, "geometry": { "type": "Polygon", "coordinates":[[[-118.71826171875,34.07086232376631],[-118.69628906249999,34.03445260967645],[-118.56994628906249,34.02990029603907],[-118.487548828125,33.957030069982316],[-118.37219238281249,33.86129311351553],[-118.45458984375,33.75631505992707],[-118.33923339843749,33.715201644740844],[-118.22937011718749,33.75631505992707],[-118.1414794921875,33.678639851675555],[-117.9107666015625,33.578014746143985],[-117.75146484375,33.4955977448657],[-117.55920410156249,33.55512901742288],[-117.3065185546875,33.5963189611327],[-117.0703125,33.67406853374198],[-116.69677734375,34.06176136129718],[-116.9439697265625,34.28445325435288],[-117.18017578125,34.42956713470528],[-117.3779296875,34.542762387234845],[-117.62512207031251,34.56990638085636],[-118.048095703125,34.615126683462194],[-118.44909667968749,34.542762387234845],[-118.61938476562499,34.38877925439021],[-118.740234375,34.21180215769026],[-118.71826171875,34.07086232376631]]] } }, + { "type": "Feature", "properties": { "iso2": "US", "iso3": "SD-CA", "name": "San Diego County", "country": "US", "county": "SD" }, "geometry": { "type": "Polygon", "coordinates":[[[-117.23510742187501,32.861132322810946],[-117.2406005859375,32.75494243654723],[-117.1636962890625,32.68099643258195],[-117.14172363281251,32.58384932565662],[-117.09228515624999,32.46342595776104],[-117.0538330078125,32.29177633471201],[-116.96044921875,32.194208672875384],[-116.85607910156249,32.16631295696736],[-116.6748046875,32.20350534542368],[-116.3671875,32.319633552035214],[-116.1474609375,32.55144352864431],[-116.1639404296875,32.80574473290688],[-116.4111328125,33.073130945006625],[-116.72973632812499,33.08233672856376],[-117.09228515624999,32.99484290420988],[-117.2515869140625,32.96258644191747], [-117.23510742187501,32.861132322810946]]] } } + ] +} + +``` + diff --git 
a/_dashboards/visualize/maps-stats-api.md b/_dashboards/visualize/maps-stats-api.md new file mode 100644 index 00000000..7939a4e7 --- /dev/null +++ b/_dashboards/visualize/maps-stats-api.md @@ -0,0 +1,136 @@ +--- +layout: default +title: Maps Stats API +nav_order: 20 +grand_parent: Building data visualizations +parent: Using coordinate and region maps +has_children: false +--- + +# Maps Stats API +Introduced 2.7 +{: .label .label-purple } + +When you create and save a [map]({{site.url}}{{site.baseurl}}/dashboards/visualize/maps/) in OpenSearch Dashboards, the map becomes a saved object of type `map`. The Maps Stats API provides information about such saved objects in OpenSearch Dashboards. + +#### Example request + +You can access the Maps Stats API by providing its URL address in the following format: + +``` +/api/maps-dashboards/stats +``` + +The OpenSearch Dashboards endpoint address may contain a port number if it is specified in the OpenSearch configuration file. The specific URL format depends on the type of OpenSearch deployment and the network environment in which it is hosted. +{: .note} + +You can query the endpoint in two ways: + + - By accessing the endpoint address (for example, `http://localhost:5601/api/maps-dashboards/stats`) in a browser + + - By using the `curl` command in the terminal: + ```bash + curl -X GET http://localhost:5601/api/maps-dashboards/stats + ``` + {% include copy.html %} + +#### Example response + +The following is the response for the preceding request: + +```json +{ + "maps_total":4, + "layers_filters_total":4, + "layers_total":{ + "opensearch_vector_tile_map":2, + "documents":7, + "wms":1, + "tms":2 + }, + "maps_list":[ + { + "id":"88a24e6c-0216-4f76-8bc7-c8db6c8705da", + "layers_filters_total":4, + "layers_total":{ + "opensearch_vector_tile_map":1, + "documents":3, + "wms":0, + "tms":0 + } + }, + { + "id":"4ce3fe50-d309-11ed-a958-770756e00bcd", + "layers_filters_total":0, + "layers_total":{ + "opensearch_vector_tile_map":0, + "documents":2, + "wms":0, + "tms":1 + } + }, + { + "id":"af5d3b90-d30a-11ed-a605-f7ad7bc98642", + "layers_filters_total":0, + "layers_total":{ + "opensearch_vector_tile_map":1, + "documents":1, + "wms":0, + "tms":1 + } + }, + { + "id":"5ca1ec10-d30b-11ed-a042-93d8ff0f09ee", + "layers_filters_total":0, + "layers_total":{ + "opensearch_vector_tile_map":0, + "documents":1, + "wms":1, + "tms":0 + } + } + ] +} +``` + +## Response fields + +The response contains statistics for the following layer types: + +- Basemaps: Either a default OpenSearch map or custom base layer maps. + +- WMS layers: Custom WMS base layer maps. + +- TMS layers: Custom TMS base layer maps. + +- Document layers: The map's data layers. + +For more information about the layer types, see [Adding layers]({{site.url}}{{site.baseurl}}/dashboards/visualize/maps/#adding-layers). + +The following table lists all response fields. + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `maps_total` | Integer | The total number of maps registered as saved objects with the Maps plugin. | +| `layers_filters_total` | Integer | The total number of filters for all layers in all maps. This includes [layer-level filters]({{site.url}}{{site.baseurl}}/dashboards/visualize/maps/#filtering-data-at-the-layer-level) but excludes global filters like [shape filters]({{site.url}}{{site.baseurl}}/dashboards/visualize/maps/#drawing-shapes-to-filter-data). | +| `layers_total` | Object | Totals statistics for all layers in all maps. 
| +| `layers_total.opensearch_vector_tile_map` | Integer | The total number of OpenSearch basemaps in all maps. | +| `layers_total.documents` | Integer | The total number of document layers in all maps. | +| `layers_total.wms` | Integer | The total number of WMS layers in all maps. | +| `layers_total.tms` | Integer | The total number of TMS layers in all maps. | +| `maps_list` | Array | A list of all maps saved in OpenSearch Dashboards. | + +Each map in the `map_list` contains the following fields. + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `id` | String | The map's saved object ID. | +| `layers_filters_total` | Integer | The total number of filters for all layers in the map. This includes [layer-level filters]({{site.url}}{{site.baseurl}}/dashboards/visualize/maps/#filtering-data-at-the-layer-level) but excludes global filters like [shape filters]({{site.url}}{{site.baseurl}}/dashboards/visualize/maps/#drawing-shapes-to-filter-data) . | +| `layers_total` | Object | Totals statistics for all layers in the map. | +| `layers_total.opensearch_vector_tile_map` | Integer | The total number of OpenSearch basemaps in the map. | +| `layers_total.documents` | Integer | The total number of document layers in the map. | +| `layers_total.wms` | Integer | The total number of WMS layers in the map. | +| `layers_total.tms` | Integer | The total number of TMS layers in the map. | + +The saved object ID helps you navigate to a particular map because the ID is the last part of the map's URL. For example, in OpenSearch Playground, the address of the `[Flights] Flights Status on Maps Destination Location` map is `https://playground.opensearch.org/app/maps-dashboards/88a24e6c-0216-4f76-8bc7-c8db6c8705da`, where `88a24e6c-0216-4f76-8bc7-c8db6c8705da` is the saved object ID for this map. +{: .tip} diff --git a/_dashboards/visualize/maps.md b/_dashboards/visualize/maps.md new file mode 100644 index 00000000..8a4196e4 --- /dev/null +++ b/_dashboards/visualize/maps.md @@ -0,0 +1,263 @@ +--- +layout: default +title: Using maps +grand_parent: Building data visualizations +parent: Using coordinate and region maps +nav_order: 10 +redirect_from: + - /dashboards/maps/ +--- + +# Using maps + +With OpenSearch Dashboards, you can create maps to visualize your geographical data. OpenSearch lets you construct map visualizations with multiple layers, combining data across different indexes. You can build each layer from a different index pattern. Additionally, you can configure maps to show specific data at different zoom levels. OpenSearch maps are powered by the OpenSearch maps service, which uses vector tiles to render maps. + +## Creating a new map + +You can create a new map from the **Maps** or **Visualize** workflows by performing the following steps: + +- To create a new map from the **Maps** workflow, perform the following steps: + + 1. On the top menu bar, go to **OpenSearch Plugins > Maps**. + 1. Choose the **Create map** button. + +- To create a new map from the **Visualize** workflow, perform the following steps: + + 1. On the top menu bar, go to **OpenSearch Dashboards > Visualize**. + 1. Choose the **Create visualization** button. + 1. In the **New Visualization** dialog, choose **Maps**. + +You can now see the default OpenSearch basemap. + +To examine the **Default map** layer configuration, in the **Layers** panel on the upper left of the map, select **Default map**, as shown in the following image. 
+ +Default map + +To hide the **Layers** panel, select the collapse (arrow) icon in the panel's upper-right corner. +{: .tip} + +## Layer settings + +To change the default map settings, select **Default map** in the **Layers** panel. Under **Layer settings**, you can change the layer name and description and configure zoom levels and opacity for your layer: + +- **Zoom levels**: By default, a layer is visible at all zoom levels. If you want to make a layer visible only for a certain range of zoom levels, you can specify the zoom levels either by entering them in the text boxes or by sliding the range slider to the desired values. + +- **Opacity**: If your map contains multiple layers, one layer can obscure another one. In this case, you may want to reduce the opacity of the top layer so you can see both layers at the same time. + +## Adding layers + +To add a layer to the map, in the **Layers** panel, select the **Add layer** button. The **Add layer** dialog is shown in the following image. + +Add layer + +You can add **base layers** or **data layers** to the map: + +- A **base layer** serves as a basemap. To use your own or a third-party map as a base layer, [add it as a **Custom map**](#adding-a-custom-map). + +- **Data layers** let you visualize data from various data sources. + +## Adding a custom map + +OpenSearch supports Web Map Service (WMS) or Tile Map Service (TMS) custom maps. To add a TMS custom map, perform the following steps: + +1. In the **Layers** panel, select the **Add layer** button. +1. From the **Add layer** dialog, select **Base layer > Custom map**. + Follow the next steps in the **New layer** dialog, which is shown in the following image. + + Add custom map + +1. In the **Custom type** dropdown list, select **Tile Map Service (TMS)**. +1. Enter the TMS URL. +1. (Optional) In **TMS attribution**, enter a TMS attribution for the basemap. For example, if you're using a custom basemap, enter the custom map name. This name will be displayed in the lower-right corner of the map. +1. Select the **Settings** tab to edit the layer settings. +1. Enter the layer name in **Name**. +1. (Optional) Enter the layer description in **Description**. +1. (Optional) Select the zoom levels and opacity for this layer. +1. Select the **Update** button. + +## Adding a document layer + +Adding document layers lets you visualize your data. You can add one index pattern per document layer. To view multiple index patterns, create multiple layers. + +Document layers can display geopoint and geoshape document fields. +{: .note} + +The following example assumes that you have the `opensearch_dashboards_sample_data_flights` dataset installed. If you don't have this dataset installed, perform the following steps: + +1. On the top left, select the home icon. +1. Select **Add sample data**. +1. In the **Sample flight data** panel, select the **Add data** button. + +Add a document layer as follows: + +1. In the **Layers** panel, select the **Add layer** button. +1. From the **Add layer** dialog, select **Data layer > Documents**. +1. In **Data source**, select `opensearch_dashboards_sample_data_flights`. Alternatively, you can enter another index pattern to visualize. +1. In **Geospatial field**, select a geospatial field (geopoint or geoshape) to be displayed in the visualization. In this example, select `DestLocation`. +1. (Optional) Select the **Style** tab to change the fill color, border color, border thickness, or marker size. +1. Select the **Settings** tab to edit layer settings. +1. 
Enter `Flight destination` in **Name**. +1. Select the **Update** button. +1. To see more data, in the upper-right corner select the calendar icon dropdown list, then under **Quick select**, choose **Last 15 days** and select the **Apply** button. + +You should see the flight destination data, as in the following image. + +Flight destination map + +## Filtering data + +To show a subset of the data in the index, filter the data. You can either filter data at the layer level or draw shapes on the map to filter all layer data globally. + +### Filtering data at the layer level + +To filter data at the layer level, select the layer and add a filter to it. + +The following example shows how to filter the flight destination data to display only United States destinations: + +1. In the **Layers** panel, select **Flight destination**. +1. Select **Filters**. +1. Select **Add filter**. +1. In **Edit filter**, select **DestCountry** in **Field**. +1. In **Operator**, select **is**. +1. In **Value**, select **US**. +1. Select the **Save** button. +1. Select the **Update** button. + +For large datasets, you may want to avoid loading data for the full map. To load data only for a specific geographic area, select **Only request data around map extent**. +{: .tip} + +### Drawing shapes to filter data + +You can filter your data globally by drawing [shapes]({{site.url}}{{site.baseurl}}/field-types/geo-shape) on the map. To draw a rectangle or polygon on the map, perform the following steps: + +1. Select the **Rectangle** or **Polygon** icon on the right side of the map. +1. In the **Filter label** field, enter a name for the filter. +1. Choose a spatial relation type. By default, **Intersects** is selected. See [Spatial relations]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/xy#spatial-relations) for more information about spatial relationship types. +1. Select the **Draw Rectangle** or **Draw Polygon** button. +1. Draw the shape over the map area that you want to select: + - For a rectangle, select any starting point on the map (this point becomes a rectangle vertex). Then hover (do not drag) to another point on the map and select it (this point becomes the opposite vertex). + - For a polygon, select any starting point on the map (this point becomes a polygon vertex) and hover (do not drag) to each subsequent vertex and select that point. Finally, make sure to select the starting point again to close the polygon, as shown in the following image. + +Drawing a polygon on a map + +### Disabling the shape filter for a map layer + +By default, the shape filter is applied globally to all layers on the map. If you want to disable your shape filter for a map layer, perform the following steps: +1. Select the layer from the **Layers** panel. +1. In the **Filters** section, deselect **Apply global filters**. +1. Select the **Update** button. + +### Modifying an existing shape filter + +To modify an existing shape filter, select your filter on the top left above the map. You can perform the following operations on an existing filter: + +- **Edit filter**: Change the filter name or modify the shape's coordinates. +- **Exclude results**: Negate the filter, that is, show all data points _except_ those to which the filter applies. +- **Temporarily disable**: Disable the filter until you select **Re-enable**. +- **Delete**: Remove your filter completely. + +## Using tooltips to visualize additional data + +Document layers show geopoint and geoshape document fields as locations on the map. 
To add more information to the locations, you can use tooltips. For example, you may want to show flight delay, destination weather, and destination country information in the **Flight destination** layer. Perform the following steps to configure tooltips to show additional data: + +1. In the **Layers** panel, select **Flight destination**. +1. Select **Tooltips**. +1. Select the **Show tooltips** checkbox. +1. In the **Tooltip fields** dropdown list, select the fields that you'd like to display. In this example, select `FlightDelay`, `DestWeather`, and `DestCountry`. +1. Select the **Update** button. + +To view tooltips, hover over the geographical point you're interested in. One tooltip can display many data points. For example, in the **Flight destination** layer there are multiple flights for a single destination city. To paginate over the flights, select the city you're interested in and use the arrows in the tooltip, as shown in the following image. + +Flight destination tooltip + +If a point on the map contains data from multiple layers, one tooltip can display data from multiple layers. To see all layers, select **All layers**. To choose a particular layer, select the layer name in the tooltip layer selection panel, as shown in the following image. + +Tooltip with a layer selection panel + +## Adding labels to layers + +Adding a label to a layer lets you visualize additional data on the map. For example, you may want to see the origin weather in the **Flight destination** layer. Perform the following steps to add a label to the **Flight destination** layer: + +1. In the **Layers** panel, select **Flight destination**. +1. In the **Style** tab, select the **Add label** checkbox. +1. You can choose to add a label based on fixed text to all data points in the layer or to use a field value as the label text. + - To add a fixed-text label, under **Label text**, select **Fixed** and enter your desired label text. + - To add a label based on a field value, under **Label text**, select **Field value** and select the field name. In this example, select `OriginWeather`. +1. (Optional) Change the label size, color, border color, or border width. +1. Select the **Update** button. + +The label with the origin weather is visible on the map and also added to the tooltips, as shown in the following image. + +Label based on field type added to map and tooltips + +## Reordering, hiding, and deleting layers + +The **Layers** panel lets you reorder, hide, and delete layers: + +- Layers on a map are stacked on top of each other. To reorder layers, use the handlebar (two horizontal lines) icon next to the layer name to drag the layer to the desired position. + +- If you'd like to hide a layer, select the show/hide (eye) icon next to the layer name. Toggle the show/hide icon to show the layer again. + +- To delete a layer, select the delete (trash can) icon next to the layer name. + +## Refreshing data for a real-time dataset + +If you want to visualize a real-time dataset, after adding layers to the map, perform the following steps to set the refresh interval: + +1. Select the calendar icon in the upper-right corner. +1. Under **Refresh every**, select or enter the refresh interval (for example, 1 second). +1. Select the **Start** button. + +Refreshing a map + +## Saving a map + +To save a map with all the layers that you set up, perform the following steps: + +1. Select the **Save** button in the upper-right corner. +1. In the **Save map** dialog, enter the map name in the **Title** text box. +1. 
(Optional) In the **Description** text box, enter the map description.
1. Select the **Save** button.

To open your saved map, choose **Maps** in the upper-left corner. The list of saved maps is displayed.

## Adding a map to a dashboard

You can add a new or existing map to a new or existing dashboard by performing the following steps:

- To add a map to a new dashboard, first create the dashboard as follows:

  1. On the top menu bar, go to **OpenSearch Dashboards > Dashboard**.
  1. Choose the **Create dashboard** button.
  1. Choose the **Create new** button.

- To add a map to an existing dashboard, first open the dashboard as follows:
  1. On the top menu bar, go to **OpenSearch Dashboards > Dashboard**.
  1. Select the dashboard you want to open from the list.
  1. In the upper-right corner, choose **Edit**.

Once you've opened a dashboard, you can add a new or existing map to it.

### Adding an existing map

1. From the top menu, choose **Add**.
1. In the **Types** dropdown list, select **Maps**.
1. Select the map you want to add from the list.

### Adding a new map

1. From the top menu, choose the **Create new** button.
1. In the **New Visualization** dialog, choose **Maps**.
1. Edit the default map by adding a basemap, layers, or tooltips.
1. In the upper-right corner, choose the **Save** button.
1. In the **Save map** dialog, enter the **Title** and an optional **Description** for the map.
1. Select **Add to Dashboard after saving** (this option is selected by default).
1. Choose the **Save and return** button.

## Editing a map from a dashboard

1. In the dashboard, choose the gear icon in the upper-right corner of the map you want to edit.
1. Choose **Edit maps**.
1. Edit the map.
1. In the upper-right corner, choose the **Save** button.
1. In the **Save map** dialog, choose the **Save and return** button. \ No newline at end of file diff --git a/_dashboards/visualize/maptiles.md b/_dashboards/visualize/maptiles.md new file mode 100644 index 00000000..6b8cc06e --- /dev/null +++ b/_dashboards/visualize/maptiles.md @@ -0,0 +1,34 @@ +--- +layout: default +title: Configuring a Web Map Service (WMS) +grand_parent: Building data visualizations +parent: Using coordinate and region maps +nav_order: 30 +redirect_from: + - /dashboards/maptiles/ +--- +

{%- comment -%}The `/docs/opensearch-dashboards/maptiles/` redirect is specifically to support the UI links in OpenSearch Dashboards 1.0.0.{%- endcomment -%}

# Configuring a Web Map Service (WMS)

The Open Geospatial Consortium (OGC) Web Map Service (WMS) specification is an international standard for requesting dynamic maps on the web. OpenSearch Dashboards includes default map tiles. For specialized maps, you can configure a WMS in OpenSearch Dashboards by following these steps:

1. Log in to OpenSearch Dashboards at `https://<host>:<port>`. For example, you can connect to OpenSearch Dashboards at [https://localhost:5601](https://localhost:5601). The default username and password are both `admin`.
2. Choose **Management** > **Advanced Settings**.
3. Locate `visualization:tileMap:WMSdefaults`.
4. Change `enabled` to `true` and add the URL of a valid WMS server, as shown in the following example:

    ```json
    {
      "enabled": true,
      "url": "",
      "options": {
        "format": "image/png",
        "transparent": true
      }
    }
    ```

Web map services may have licensing fees or restrictions, and you are responsible for complying with any such fees or restrictions. 
+{: .note } diff --git a/_dashboards/visualize/selfhost-maps-server.md b/_dashboards/visualize/selfhost-maps-server.md new file mode 100644 index 00000000..925c5449 --- /dev/null +++ b/_dashboards/visualize/selfhost-maps-server.md @@ -0,0 +1,110 @@ +--- +layout: default +title: Using the self-host maps server +grand_parent: Building data visualizations +parent: Using coordinate and region maps +nav_order: 40 +redirect_from: + - /dashboards/selfhost-maps-server/ +--- + +# Using the self-host maps server + +The self-host maps server for OpenSearch Dashboards allows users to access the default maps service in air-gapped environments. OpenSearch-compatible map URLs include a map manifest with map tiles and vectors, the map tiles, and the map vectors. + +The following sections provide steps for setting up and using the self-host maps server with OpenSearch Dashboards. + +You can access the `maps-server` image via the official OpenSearch [Docker Hub repository](https://hub.docker.com/u/opensearchproject). +{: .note} + +## Pulling the Docker image + +Open your terminal and run the following command: + +`docker pull opensearchproject/opensearch-maps-server:1.0.0` + +## Setting up the server + +You must set up the map tiles before running the server. You have two setup options: Use the OpenSearch-provided maps service tiles set, or generate the raster tiles set. + +### Option 1: Use the OpenSearch-provided maps service tiles set + +Create a Docker volume to hold the tiles set: + +`docker volume create tiles-data` + +Download the tiles set from the OpenSearch maps service. Two planet tiles sets are available based on the desired zoom level: + +- Zoom Level 8 (https://maps.opensearch.org/offline/planet-osm-default-z0-z8.tar.gz) +- Zoom level 10 (https://maps.opensearch.org/offline/planet-osm-default-z0-z10.tar.gz) + +The planet tiles set for zoom level 10 (2 GB compressed/6.8 GB uncompressed) is approximately 10 times larger than the set for zoom level 8 (225 MB compressed/519 MB uncompressed). +{: .note} + +``` +docker run \ + -e DOWNLOAD_TILES=https://maps.opensearch.org/offline/planet-osm-default-z0-z8.tar.gz \ + -v tiles-data:/usr/src/app/public/tiles/data/ \ + opensearch/opensearch-maps-server \ + import +``` + +### Option 2: Generate the raster tiles set + +To generate the raster tiles set, use the [raster tile generation pipeline](https://github.com/opensearch-project/maps/tree/main/tiles-generation/cdk) and then use the tiles set absolute path to create a volume to start the server. + +## Starting the server + +Use the following command to start the server using the Docker volume `tiles-data`. The following command is an example using host URL "localhost" and port "8080": + +``` +docker run \ + -v tiles-data:/usr/src/app/public/tiles/data/ \ + -e HOST_URL='http://localhost' \ + -p 8080:8080 \ + opensearch/opensearch-maps-server \ + run +``` + +Or, if you generated the raster tiles set, run the server using that tiles set: + +``` +docker run \ + -v /absolute/path/to/tiles/:/usr/src/app/dist/public/tiles/data/ \ + -p 8080:8080 \ + opensearch/opensearch-maps-server \ + run +``` +To access the tiles set, open the URLs in a browser on the host or use the `curl` command `curl http://localhost:8080/manifest.json`. + + +Confirm the server is running by opening each of the following links in a browser on your host or with a `curl` command (for example, `curl http://localhost:8080/manifest.json`). 
+ +* Map manifest URL: `http://localhost:8080/manifest.json` +* Map tiles URL: `http://localhost:8080/tiles/data/{z}/{x}/{y}.png` +* Map tiles demo URL: `http://localhost:8080/` + +## Using the self-host maps server with OpenSearch Dashboards + +You can use the self-host maps server with OpenSearch Dashboards by either adding the parameter to `opensearch_dashboards.yml` or configuring the default WMS properties in OpenSearch Dashboards. + +### Option 1: Configure opensearch_dashboards.yml + +Configure the manifest URL in `opensearch_dashboards.yml`: + +`map.opensearchManifestServiceUrl: "http://localhost:8080/manifest.json"` + +### Option 2: Configure Default WMS properties in OpenSearch Dashboards + +1. On the OpenSearch Dashboards console, select **Dashboards Management** > **Advanced Settings**. +2. Locate `visualization:tileMap:WMSdefaults` under **Default WMS properties**. +3. Change `"enabled": false` to `"enabled": true` and add the URL for the valid map server. + +## Licenses + +Tiles are generated per [Terms of Use for Natural Earth vector map data](https://www.naturalearthdata.com/about/terms-of-use/) and [Copyright and License for OpenStreetMap](https://www.openstreetmap.org/copyright). + +## Related articles + +* [Configuring a Web Map Service (WMS)]({{site.url}}{{site.baseurl}}/dashboards/visualize/maptiles/) +* [Using coordinate and region maps]({{site.url}}{{site.baseurl}}/dashboards/visualize/geojson-regionmaps/) diff --git a/_dashboards/visualize/visbuilder.md b/_dashboards/visualize/visbuilder.md new file mode 100644 index 00000000..7b32e818 --- /dev/null +++ b/_dashboards/visualize/visbuilder.md @@ -0,0 +1,57 @@ +--- +layout: default +title: Using VisBuilder +parent: Building data visualizations +nav_order: 100 +redirect_from: + - /dashboards/drag-drop-wizard/ +--- + +# Using VisBuilder + +VisBuilder is an experimental feature and shouldn't be used in a production environment. For updates on its progress, or if you want to leave feedback that helps improve the feature, see the [GitHub issue](https://github.com/opensearch-project/OpenSearch-Dashboards/issues/2280). +{: .warning} + +You can use the VisBuilder visualization type in OpenSearch Dashboards to create data visualizations by using a drag-and-drop gesture. With VisBuilder you have: + +* An immediate view of your data without the need to preselect the visualization output. +* The flexibility to change visualization types and index patterns quickly. +* The ability to easily navigate between multiple screens. + +VisBuilder new visualization start page + +## Try VisBuilder in the OpenSearch Dashboards playground + +If you'd like to try out VisBuilder without installing OpenSearch locally, you can do so in the [Dashboards playground](https://playground.opensearch.org/app/vis-builder#/). VisBuilder is enabled by default. + +## Try VisBuilder locally + +VisBuilder is enabled by default. If you want to disable it, set the feature `flag vis_builder.enabled:` to `false` in the `opensearch_dashboards.yml` file as follows: + +``` +# Set the value of this setting to false to disable VisBuilder +# functionality in Visualization. +vis_builder.enabled: false +``` + +Follow these steps to create a new visualization using VisBuilder in your environment: + +1. Open Dashboards: + - If you're not running the Security plugin, go to http://localhost:5601. + - If you're running the Security plugin, go to https://localhost:5601 and log in with your username and password (default is admin/admin). + +2. 
Confirm that the **Enable experimental visualizations** option is turned on. + - From the top menu, select **Management** > **Dashboards Management** > **Advanced Settings**. + - Select **Visualization** and verify that the option is turned on. + + Enable experimental visualizations + +3. From the top menu, select **Visualize** **>** **Create visualization** **>** **VisBuilder**. + + Select the VisBuilder visualization type + +4. Drag and drop field names from the left column into the **Configuration** panel to generate a visualization. + +Here’s an example visualization. Your visualization will look different depending on your data and the fields you select. + +Visualization generated using sample data diff --git a/_dashboards/visualize/viz-index.md b/_dashboards/visualize/viz-index.md new file mode 100644 index 00000000..75407a6b --- /dev/null +++ b/_dashboards/visualize/viz-index.md @@ -0,0 +1,128 @@ +--- +layout: default +title: Building data visualizations +nav_order: 40 +has_children: true +--- + +# Building data visualizations + +By visualizing your data, you translate complex, high-volume, or numerical data into a visual representation that is easier to process. OpenSearch Dashboards gives you data visualization tools to improve and automate the visual communication process. By using visual elements like charts, graphs, or maps to represent data, you can advance business intelligence and support data-driven decision-making and strategic planning. + +## Understanding the visualization types in OpenSearch Dashboards + +Dashboards has several visualization types to support your data analysis needs. The following sections provide an overview of the visualization types in Dashboards and their common use cases. + +### Area charts + +Area charts depict changes over time, and they are commonly used to show trends. Area charts more efficiently identify patterns in log data, such as sales data for a time range and trends over that time. See [Using area charts]({{site.url}}{{site.baseurl}}/dashboards/visualize/area/) to learn more about how to create and use them in Dashboards. + + Example area chart in OpenSearch Dashboards + +### Bar charts + +Bar charts (vertical or horizontal) compare categorical data and depict changes of a variable over a period of time. + +Vertical bar chart | Horizontal bar chart +:-------------------------:|:-------------------------: +Example vertical bar chart in OpenSearch Dashboards | Example horizontal bar chart in OpenSearch Dashboards + +### Controls + +Controls is a panel, instead of a visualization type, added to a dashboard to filter data. Controls gives users the capability to add interactive inputs to a dashboard. You can create two types of controls in Dashboards: **Options list** and **Range slider**. **Options list** is a dropdown options list that allows filtering of data by a terms aggregation, such as `machine.os.keyword`. **Range slider** allows filtering within specified value ranges, such as `hour_of_day`. + +Example visualization using controls to filter data in OpenSearch Dashboards + +### Data tables + +Data tables, or tables, show your raw data in tabular form. + +Example data table in OpenSearch Dashboards + +### Gantt charts + +Gantt charts show the start, end, and duration of unique events in a sequence. Gantt charts are useful in trace analytics, telemetry, and anomaly detection use cases where you want to understand interactions and dependencies between various events in a schedule. 
**Gantt chart** is currently a plugin visualization type in Dashboards rather than a built-in one. See [Gantt charts]({{site.url}}{{site.baseurl}}/dashboards/visualize/gantt/) to learn how to create and use them in Dashboards.

Example Gantt chart in OpenSearch Dashboards

### Gauge charts

Gauge charts look similar to an analog speedometer that reads left to right from zero. They display how much there is of the thing you are measuring, and this measurement can exist alone or in relation to another measurement, such as tracking performance against benchmarks or goals.

Example gauge chart in OpenSearch Dashboards

### Heat maps

A heat map is a view of a histogram (a graphical representation of the distribution of numerical data) over time. Instead of using bar height as a representation of frequency, as with a histogram, heat maps display data in a tabular form using colors to differentiate where values fall in a range.

Example heat map in OpenSearch Dashboards

### Line charts

Line charts compare changes in measured values over a period of time, such as gross sales by month or gross sales and net sales by month.

Example line graph in OpenSearch Dashboards

### Maps

You can create two types of maps in Dashboards: Coordinate maps and Region maps. Coordinate maps show the difference between data values for each location by size. Region maps show the difference between data values for each location by varying shades of color. See [Using maps]({{site.url}}{{site.baseurl}}/dashboards/visualize/maps/) to learn more about maps capabilities in Dashboards.

#### Coordinate maps

Coordinate maps show location-based data on a map. Use coordinate maps to visualize GPS data (latitude and longitude coordinates) on a map. For information about OpenSearch-supported coordinate field types, see [Geographic field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-shape/) and [Cartesian field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/xy/).

Example coordinate map in OpenSearch Dashboards

#### Region maps

Region maps show patterns and trends across geographic locations. A region map is one of the basemaps in Dashboards. To learn how to create custom vector maps in Dashboards, see [Using coordinate and region maps]({{site.url}}{{site.baseurl}}/dashboards/visualize/geojson-regionmaps/).

Example region map in OpenSearch Dashboards

### Markdown

Markdown is the markup language used in Dashboards to provide context to your data visualizations. Using Markdown, you can display information and instructions along with the visualization.

Example Markdown visualization in OpenSearch Dashboards

### Metric values

Metric values, or number charts, compare values in different measures. For example, you can create a metrics visualization to compare two values, such as actual sales compared to sales goals.

Example metric chart in OpenSearch Dashboards

### Pie charts

Pie charts compare values for items in a dimension, such as a percentage of a total amount.

Example pie chart in OpenSearch Dashboards

### TSVB

The time-series visual builder (TSVB) is a data visualization tool in Dashboards used to create detailed time-series visualizations. For example, you can use TSVB to build visualizations that show data over time, such as flights by status over time or flight delays by delay type over time. 
Currently, TSVB can be used to create the following Dashboards visualization types: Area, Line, Metric, Gauge, Markdown, and Data Table. + +Example TSVB in OpenSearch Dashboards + +### Tag cloud + +Tag (or word) clouds are a way to display how often a word is used in relation to other words in a dataset. The best use for this type of visual is to show word or phrase frequency. + +Example Tag cloud in OpenSearch Dashboards + +### Timeline + +Timeline is a data visualization tool in Dashboards that you can use to create time-series visualizations. Currently, Timeline can be used to create the following Dashboards visualization types: Area and Line. + +Example Timeline in OpenSearch Dashboards + +### VisBuilder + +VisBuilder is a drag-and-drop data visualization tool in Dashboards. It gives you an immediate view of your data without the need to preselect the data source or visualization type output. Currently, VisBuilder can be used to create the following Dashboards visualization types: Area, Bar, Line, Metric, and Data Table. See [VisBuilder]({{site.url}}{{site.baseurl}}/dashboards/visualize/visbuilder/) to learn how to create and use drag-and-drop visualizations in Dashboards. + +Example VisBuilder in OpenSearch Dashboards + +### Vega + +[Vega](https://vega.github.io/vega/) and [Vega-Lite](https://vega.github.io/vega-lite/) are open-source, declarative language visualization grammars for creating, sharing, and saving interactive data visualizations. Vega visualizations give you the flexibility to visualize multidimensional data using a layered approach in order to build and manipulate visualizations in a structured manner. Vega can be used to create customized visualizations using any Dashboards visualization type. + +Example Vega visualization with JSON specification in OpenSearch Dashboards diff --git a/_data-prepper/common-use-cases/anomaly-detection.md b/_data-prepper/common-use-cases/anomaly-detection.md new file mode 100644 index 00000000..e7003558 --- /dev/null +++ b/_data-prepper/common-use-cases/anomaly-detection.md @@ -0,0 +1,210 @@ +--- +layout: default +title: Anomaly detection +parent: Common use cases +nav_order: 5 +--- + +# Anomaly detection + +You can use Data Prepper to train models and generate anomalies in near real time on time-series aggregated events. You can generate anomalies either on events generated within the pipeline or on events coming directly into the pipeline, like OpenTelemetry metrics. You can feed these tumbling window aggregated time-series events to the [`anomaly_detector` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/anomaly-detector/), which trains a model and generates anomalies with a grade score. Then you can configure your pipeline to write the anomalies to a separate index to create document monitors and trigger fast alerting. + +## Metrics from logs + +The following pipeline receives logs from an HTTP source like FluentBit, extracts important values from the logs by matching the value in the `log` key against the [Grok Apache Common Log Format](https://httpd.apache.org/docs/2.4/logs.html#accesslog), and then forwards the grokked logs to both the `log-to-metrics-pipeline` pipeline and an OpenSearch index named `logs`. + +The `log-to-metrics-pipeline` pipeline receives the grokked logs from the `apache-log-pipeline-with-metrics` pipeline, aggregates them, and derives histogram metrics based on the values in the `clientip` and `request` keys. 
It then sends the histogram metrics to an OpenSearch index named `histogram_metrics` as well as to the `log-to-metrics-anomaly-detector-pipeline` pipeline. + +The `log-to-metrics-anomaly-detector-pipeline` pipeline receives the aggregated histogram metrics from the `log-to-metrics-pipeline` pipeline and sends them to the `anomaly_detector` processor to detect anomalies by using the Random Cut Forest algorithm. If the algorithm detects anomalies, it sends them to an OpenSearch index named `log-metric-anomalies`. + +```json +apache-log-pipeline-with-metrics: + source: + http: + # Provide the path for ingestion. ${pipelineName} will be replaced with pipeline name configured for this pipeline. + # In this case it would be "/apache-log-pipeline-with-metrics/logs". This will be the FluentBit output URI value. + path: "/${pipelineName}/logs" + processor: + - grok: + match: + log: [ "%{COMMONAPACHELOG_DATATYPED}" ] + sink: + - opensearch: + ... + index: "logs" + - pipeline: + name: "log-to-metrics-pipeline" + +log-to-metrics-pipeline: + source: + pipeline: + name: "apache-log-pipeline-with-metrics" + processor: + - aggregate: + # Specify the required identification keys + identification_keys: ["clientip", "request"] + action: + histogram: + # Specify the appropriate values for each the following fields + key: "bytes" + record_minmax: true + units: "bytes" + buckets: [0, 25000000, 50000000, 75000000, 100000000] + # Pick the required aggregation period + group_duration: "30s" + sink: + - opensearch: + ... + index: "histogram_metrics" + - pipeline: + name: "log-to-metrics-anomaly-detector-pipeline" + +log-to-metrics-anomaly-detector-pipeline: + source: + pipeline: + name: "log-to-metrics-pipeline" + processor: + - anomaly_detector: + # Specify the key on which to run anomaly detection + keys: [ "bytes" ] + mode: + random_cut_forest: + sink: + - opensearch: + ... + index: "log-metric-anomalies" +``` +{% include copy-curl.html %} + +## Metrics from traces + +You can derive metrics from traces and find anomalies in those metrics. In this example, the `entry-pipeline` pipeline receives trace data from the OpenTelemetry Collector and forwards it to the following pipelines: + +- `span-pipeline` –- Extracts the raw spans from the traces. The pipeline sends the raw spans to any indexes OpenSearch prefixed with `otel-v1-apm-span`. + +- `service-map-pipeline` –- Aggregates and analyzes the traces to create documents that represent connections between services. The pipeline sends these documents to an OpenSearch index named `otel-v1-apm-service-map`. You can then see a visualization of the service map through the [Trace Analytics]({{site.url}}{{site.baseurl}}/observing-your-data/trace/index/) plugin for OpenSearch Dashboards. + +- `trace-to-metrics-pipeline` -- Aggregates and derives histogram metrics from the traces based on the value of the `serviceName`. The pipeline then sends the derived metrics to an OpenSearch index named `metrics_for_traces` and to the `trace-to-metrics-anomaly-detector-pipeline` pipeline. + +The `trace-to-metrics-anomaly-detector-pipeline` pipeline receives the aggregated histogram metrics from the `trace-to-metrics-pipeline` and sends them to the `anomaly_detector` processor to detect anomalies by using the Random Cut Forest algorithm. If the algorithm detects any anomalies, it sends them to an OpenSearch index named `trace-metric-anomalies`. + +```json +entry-pipeline: + source: + otel_trace_source: + # Provide the path for ingestion. 
${pipelineName} will be replaced with pipeline name configured for this pipeline. + # In this case it would be "/entry-pipeline/v1/traces". This will be endpoint URI path in OpenTelemetry Exporter + # configuration. + # path: "/${pipelineName}/v1/traces" + processor: + - trace_peer_forwarder: + sink: + - pipeline: + name: "span-pipeline" + - pipeline: + name: "service-map-pipeline" + - pipeline: + name: "trace-to-metrics-pipeline" + +span-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - otel_trace_raw: + sink: + - opensearch: + ... + index_type: "trace-analytics-raw" + +service-map-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - service_map: + sink: + - opensearch: + ... + index_type: "trace-analytics-service-map" + +trace-to-metrics-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - aggregate: + # Pick the required identification keys + identification_keys: ["serviceName"] + action: + histogram: + # Pick the appropriate values for each the following fields + key: "durationInNanos" + record_minmax: true + units: "seconds" + buckets: [0, 10000000, 50000000, 100000000] + # Pick the required aggregation period + group_duration: "30s" + sink: + - opensearch: + ... + index: "metrics_for_traces" + - pipeline: + name: "trace-to-metrics-anomaly-detector-pipeline" + +trace-to-metrics-anomaly-detector-pipeline: + source: + pipeline: + name: "trace-to-metrics-pipeline" + processor: + - anomaly_detector: + # Below Key will find anomalies in the max value of histogram generated for durationInNanos. + keys: [ "max" ] + mode: + random_cut_forest: + sink: + - opensearch: + ... + index: "trace-metric-anomalies" +``` +{% include copy-curl.html %} + +## OpenTelemetry metrics + +You can create a pipeline that receives OpenTelemetry metrics and detects anomalies in those metrics. In this example, `entry-pipeline` receives metrics from the OpenTelemetry Collector. If a metric is of type `GAUGE` and the name of the metric is `totalApiBytesSent`, the processor sends it to the `ad-pipeline` pipeline. + +The `ad-pipeline` pipeline receives the metrics from the entry pipeline and performs anomaly detection on the metric values by using the [`anomaly_detector` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/anomaly-detector/). + +```json +entry-pipeline: + source: + otel_metrics_source: + processor: + - otel_metrics: + route: + - gauge_route: '/kind = "GAUGE" and /name = "totalApiBytesSent"' + sink: + - pipeline: + name: "ad-pipeline" + routes: + - gauge_route + - opensearch: + ... + index: "otel-metrics" + +ad-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - anomaly_detector: + # Use "value" as the key on which anomaly detector needs to be run + keys: [ "value" ] + mode: + random_cut_forest: + sink: + - opensearch: + ... 
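          # Anomaly events generated by the anomaly_detector processor on the gauge values are written to this index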
          index: otel-metrics-anomalies
```
{% include copy-curl.html %} diff --git a/_data-prepper/common-use-cases/codec-processor-combinations.md b/_data-prepper/common-use-cases/codec-processor-combinations.md new file mode 100644 index 00000000..57185f2c --- /dev/null +++ b/_data-prepper/common-use-cases/codec-processor-combinations.md @@ -0,0 +1,47 @@ +--- +layout: default +title: Codec processor combinations +parent: Common use cases +nav_order: 10 +--- +

# Codec processor combinations

At ingestion time, data received by the [`s3` source]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3/) can be parsed by [codecs]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3#codec). Codecs compress and decompress large data sets in a certain format before ingesting them through a Data Prepper pipeline [processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/processors/).

While most codecs can be used with most processors, the following codec and processor combinations can make your pipeline more efficient when used with the corresponding input types.

## JSON array

A [JSON array](https://json-schema.org/understanding-json-schema/reference/array) is used to order elements of different types. Because an array is required in JSON, the data contained within the array must be tabular.

The JSON array does not require a processor.

## NDJSON

Unlike a JSON array, [NDJSON](https://www.npmjs.com/package/ndjson) allows each row of data to be delimited by a newline, meaning that data is processed per line instead of as an array.

The NDJSON input type is parsed using the [newline]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3#newline-codec) codec, which parses each line as a single log event. The [parse_json]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/parse-json/) processor then outputs each line as a single event.

## CSV

The CSV data type inputs data as a table. It does not require both a codec and a processor, but it does require one or the other: either just the `csv` processor or just the `csv` codec.

The CSV input type is most effective when used with the following codec and processor combinations.

### `csv` codec

When the [`csv` codec]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3#csv-codec) is used without a processor, it automatically detects headers from the CSV and uses them for index mapping.

### `newline` codec

The [`newline` codec]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3#newline-codec) parses each row as a single log event. The codec only detects a header when `header_destination` is configured. The [csv]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/csv/) processor then outputs the event into columns. The header detected in `header_destination` by the `newline` codec can be used in the `csv` processor under `column_names_source_key`.

## Parquet

[Apache Parquet](https://parquet.apache.org/docs/overview/) is a columnar storage format built for Hadoop. It is most efficient without the use of a codec. Positive results, however, can be achieved when it's configured with [S3 Select]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3#using-s3_select-with-the-s3-source).

## Avro

[Apache Avro](https://avro.apache.org/) helps streamline streaming data pipelines. 
It is most efficient when used with the [`avro` codec]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sinks/s3#avro-codec) inside an `s3` sink.

diff --git a/_data-prepper/common-use-cases/common-use-cases.md b/_data-prepper/common-use-cases/common-use-cases.md new file mode 100644 index 00000000..342a8fc8 --- /dev/null +++ b/_data-prepper/common-use-cases/common-use-cases.md @@ -0,0 +1,12 @@ +--- +layout: default +title: Common use cases +has_children: true +nav_order: 15 +redirect_from: + - /data-prepper/common-use-cases/ +--- + +# Common use cases + +You can use Data Prepper for several different purposes, including trace analytics, log analytics, Amazon S3 log analytics, and metrics ingestion. \ No newline at end of file diff --git a/_data-prepper/common-use-cases/event-aggregation.md b/_data-prepper/common-use-cases/event-aggregation.md new file mode 100644 index 00000000..f6e2757d --- /dev/null +++ b/_data-prepper/common-use-cases/event-aggregation.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Event aggregation +parent: Common use cases +nav_order: 25 +--- +

# Event aggregation

You can use Data Prepper to aggregate data from different events over a period of time. Aggregating events can help to reduce unnecessary log volume and manage use cases like multiline logs that are received as separate events. The [`aggregate` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) is a stateful processor that groups events based on the values for a set of specified identification keys and performs a configurable action on each group.

The `aggregate` processor state is stored in memory. For example, in order to combine four events into one, the processor needs to retain pieces of the first three events. The state of an aggregate group of events is kept for a configurable amount of time. Depending on your logs, the aggregate action being used, and the memory options in the processor configuration, the aggregation could take place over a long period of time.

## Basic usage

The following example pipeline extracts the fields `sourceIp`, `destinationIp`, and `port` using the [`grok` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/grok/) and then aggregates on those fields over a period of 30 seconds using the [`aggregate` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) and the `put_all` action. At the end of the 30-second period, the aggregated log is sent to the OpenSearch sink.

```json
aggregate_pipeline:
  source:
    http:
      path: "/${pipelineName}/logs"
  processor:
    - grok:
        match:
          log: ["%{IPORHOST:sourceIp} %{IPORHOST:destinationIp} %{NUMBER:port:int}"]
    - aggregate:
        group_duration: "30s"
        identification_keys: ["sourceIp", "destinationIp", "port"]
        action:
          put_all:
  sink:
    - opensearch:
        ...
        index: aggregated_logs
```
{% include copy-curl.html %}

For example, consider the following batch of logs:

```json
{ "log": "127.0.0.1 192.168.0.1 80", "status": 200 }
{ "log": "127.0.0.1 192.168.0.1 80", "bytes": 1000 }
{ "log": "127.0.0.1 192.168.0.1 80", "http_verb": "GET" }
```
{% include copy-curl.html %}

The `grok` processor will extract keys such that the log events will look like the following example. These events now have the data that the `aggregate` processor will need for the `identification_keys`. 
+ +```json +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "port": 80, "status": 200 } +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "port": 80, "bytes": 1000 } +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "port": 80, "http_verb": "GET" } +``` +{% include copy-curl.html %} + +After 30 seconds, the `aggregate` processor writes the following aggregated log to the sink: + +```json +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "port": 80, "status": 200, "bytes": 1000, "http_verb": "GET" } +``` +{% include copy-curl.html %} + +## Removing duplicates + +You can remove duplicate entries by deriving keys from incoming events and specifying the `remove_duplicates` option for the `aggregate` processor. This action immediately processes the first event for a group and drops all following events in that group. + +In the following example, the first event is processed with the identification keys `sourceIp` and `destinationIp`: + +```json +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200 } +``` +{% include copy-curl.html %} + +The pipeline will then drop the following event because it has the same keys: + +```json +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 } +``` +{% include copy-curl.html %} + +The pipeline processes this event and creates a new group because the `sourceIp` is different: + +```json +{ "sourceIp": "127.0.0.2", "destinationIp": "192.168.0.1", "bytes": 1000 } +``` +{% include copy-curl.html %} + +## Log aggregation and conditional routing + +You can use multiple plugins to combine log aggregation with conditional routing. In this example, the pipeline `log-aggregate-pipeline` receives logs by using an HTTP client, like FluentBit, and extracts important values from the logs by matching the value in the `log` key against the [Apache Common Log Format](https://httpd.apache.org/docs/2.4/logs.html). + +Two of the values that the pipeline extracts from the logs with a Grok pattern include `response` and `clientip`. The `aggregate` processor then uses the `clientip` value, along with the `remove_duplicates` option, to drop any logs that contain a `clientip` that has already been processed within the given `group_duration`. + +Three routes, or conditional statements, exist in the pipeline. These routes separate the value of the response into `2xx`, `3xx`, `4xx`, and `5xx` responses. Logs with a `2xx` or `3xx` status are sent to the `aggregated_2xx_3xx` index, logs with a `4xx` status are sent to the `aggregated_4xx index`, and logs with a `5xx` status are sent to the `aggregated_5xx` index. + +```json +log-aggregate-pipeline: + source: + http: + # Provide the path for ingestion. ${pipelineName} will be replaced with pipeline name configured for this pipeline. + # In this case it would be "/log-aggregate-pipeline/logs". This will be the FluentBit output URI value. + path: "/${pipelineName}/logs" + processor: + - grok: + match: + log: [ "%{COMMONAPACHELOG_DATATYPED}" ] + - aggregate: + identification_keys: ["clientip"] + action: + remove_duplicates: + group_duration: "180s" + route: + - 2xx_status: "/response >= 200 and /response < 300" + - 3xx_status: "/response >= 300 and /response < 400" + - 4xx_status: "/response >= 400 and /response < 500" + - 5xx_status: "/response >= 500 and /response < 600" + sink: + - opensearch: + ... + index: "aggregated_2xx_3xx" + routes: + - 2xx_status + - 3xx_status + - opensearch: + ... + index: "aggregated_4xx" + routes: + - 4xx_status + - opensearch: + ... 
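      # Logs with a 5xx response status are routed to this index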
+ index: "aggregated_5xx" + routes: + - 5xx_status +``` diff --git a/_data-prepper/common-use-cases/log-analytics.md b/_data-prepper/common-use-cases/log-analytics.md new file mode 100644 index 00000000..30a021b1 --- /dev/null +++ b/_data-prepper/common-use-cases/log-analytics.md @@ -0,0 +1,152 @@ +--- +layout: default +title: Log analytics +parent: Common use cases +nav_order: 30 +--- + +# Log analytics + +Data Prepper is an extendable, configurable, and scalable solution for log ingestion into OpenSearch and Amazon OpenSearch Service. Data Prepper supports receiving logs from [Fluent Bit](https://fluentbit.io/) through the [HTTP Source](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/http-source/README.md) and processing those logs with a [Grok Processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/grok-processor/README.md) before ingesting them into OpenSearch through the [OpenSearch sink](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/opensearch/README.md). + +The following image shows all of the components used for log analytics with Fluent Bit, Data Prepper, and OpenSearch. + +![Log analytics component]({{site.url}}{{site.baseurl}}/images/data-prepper/log-analytics/log-analytics-components.jpg) + +In the application environment, run Fluent Bit. Fluent Bit can be containerized through Kubernetes, Docker, or Amazon Elastic Container Service (Amazon ECS). You can also run Fluent Bit as an agent on Amazon Elastic Compute Cloud (Amazon EC2). Configure the [Fluent Bit http output plugin](https://docs.fluentbit.io/manual/pipeline/outputs/http) to export log data to Data Prepper. Then deploy Data Prepper as an intermediate component and configure it to send the enriched log data to your OpenSearch cluster. From there, use OpenSearch Dashboards to perform more intensive visualization and analysis. + +## Log analytics pipeline + +Log analytics pipelines in Data Prepper are extremely customizable. The following image shows a simple pipeline. + +![Log analytics component]({{site.url}}{{site.baseurl}}/images/data-prepper/log-analytics/log-ingestion-pipeline.jpg) + +### HTTP source + +The [HTTP Source](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/http-source/README.md) accepts log data from Fluent Bit. This source accepts log data in a JSON array format and supports industry-standard encryption in the form of TLS/HTTPS and HTTP basic authentication. + +### Processor + +Data Prepper 1.2 and above come with a [Grok Processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/grok-processor/README.md). The Grok Processor is an invaluable tool for structuring and extracting important fields from your logs, making them more queryable. + +The Grok Processor comes with a wide variety of [default patterns](https://github.com/thekrakken/java-grok/blob/master/src/main/resources/patterns/patterns) that match common log formats like Apache logs or syslogs, but it can easily accept any custom patterns that cater to your specific log format. + +For more information about Grok features, see the documentation. + +### Sink + +There is a generic sink that writes data to OpenSearch as the destination. 
The [OpenSearch sink](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/opensearch/README.md) has configuration options related to an OpenSearch cluster, like endpoint, SSL/username, index name, index template, and index state management. + +## Pipeline configuration + +The following sections discuss pipeline configuration. + +### Example pipeline with SSL and basic authentication enabled + +This example pipeline configuration comes with SSL and basic authentication enabled for the `http-source`: + +```yaml +log-pipeline: + source: + http: + ssl_certificate_file: "/full/path/to/certfile.crt" + ssl_key_file: "/full/path/to/keyfile.key" + authentication: + http_basic: + username: "myuser" + password: "mys3cret" + processor: + - grok: + match: + # This will match logs with a "log" key against the COMMONAPACHELOG pattern (ex: { "log": "actual apache log..." } ) + # You should change this to match what your logs look like. See the grok documenation to get started. + log: [ "%{COMMONAPACHELOG}" ] + sink: + - opensearch: + hosts: [ "https://localhost:9200" ] + # Change to your credentials + username: "admin" + password: "admin" + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + #cert: /path/to/cert + # If you are connecting to an Amazon OpenSearch Service domain without + # Fine-Grained Access Control, enable these settings. Comment out the + # username and password above. + #aws_sigv4: true + #aws_region: us-east-1 + # Since we are grok matching for apache logs, it makes sense to send them to an OpenSearch index named apache_logs. + # You should change this to correspond with how your OpenSearch indexes are set up. + index: apache_logs +``` + +This pipeline configuration is an example of Apache log ingestion. Don't forget that you can easily configure the Grok Processor for your own custom logs. You will need to modify the configuration for your OpenSearch cluster. + +The following are the main changes you need to make: + +* `hosts` – Set to your hosts. +* `index` – Change this to the OpenSearch index to which you want to send logs. +* `username` – Provide your OpenSearch username. +* `password` – Provide your OpenSearch password. +* `aws_sigv4` – If you use Amazon OpenSearch Service with AWS signing, set this to true. It will sign requests with the default AWS credentials provider. +* `aws_region` – If you use Amazon OpenSearch Service with AWS signing, set this value to the AWS Region in which your cluster is hosted. + +## Fluent Bit + +You will need to run Fluent Bit in your service environment. See [Getting Started with Fluent Bit](https://docs.fluentbit.io/manual/installation/getting-started-with-fluent-bit) for installation instructions. Ensure that you can configure the [Fluent Bit http output plugin](https://docs.fluentbit.io/manual/pipeline/outputs/http) to your Data Prepper HTTP source. The following is an example `fluent-bit.conf` that tails a log file named `test.log` and forwards it to a locally running Data Prepper HTTP source, which runs by default on port 2021. + +Note that you should adjust the file `path`, output `Host`, and `Port` according to how and where you have Fluent Bit and Data Prepper running. 
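Because the example configurations that follow set `Format json` on the Fluent Bit HTTP output, Fluent Bit sends each batch of tailed lines to the Data Prepper HTTP source as a JSON array of log objects. As a rough sketch only (the exact keys depend on your Fluent Bit input configuration; this example assumes the default `log` key produced by the `tail` input and illustrative `date` values), a request body might look similar to the following:

```json
[
  { "date": 1718900000.123, "log": "127.0.0.1 - - [10/Feb/2000:13:55:36 -0700] \"GET /index.html HTTP/1.1\" 200 2326" },
  { "date": 1718900001.456, "log": "127.0.0.1 - - [10/Feb/2000:13:55:37 -0700] \"GET /style.css HTTP/1.1\" 200 1024" }
]
```

Each element in the array becomes a separate event in the pipeline, which is why the `grok` processor in the preceding pipeline configuration matches against the `log` key.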
+ +### Example: Fluent Bit file without SSL and basic authentication enabled + +The following is an example `fluent-bit.conf` file without SSL and basic authentication enabled on the HTTP source: + +``` +[INPUT] + name tail + refresh_interval 5 + path test.log + read_from_head true + +[OUTPUT] + Name http + Match * + Host localhost + Port 2021 + URI /log/ingest + Format json +``` + +If your HTTP source has SSL and basic authentication enabled, you will need to add the details of `http_User`, `http_Passwd`, `tls.crt_file`, and `tls.key_file` to the `fluent-bit.conf` file, as shown in the following example. + +### Example: Fluent Bit file with SSL and basic authentication enabled + +The following is an example `fluent-bit.conf` file with SSL and basic authentication enabled on the HTTP source: + +``` +[INPUT] + name tail + refresh_interval 5 + path test.log + read_from_head true + +[OUTPUT] + Name http + Match * + Host localhost + http_User myuser + http_Passwd mys3cret + tls On + tls.crt_file /full/path/to/certfile.crt + tls.key_file /full/path/to/keyfile.key + Port 2021 + URI /log/ingest + Format json +``` + +# Next steps + +See the [Data Prepper Log Ingestion Demo Guide](https://github.com/opensearch-project/data-prepper/blob/main/examples/log-ingestion/README.md) for a specific example of Apache log ingestion from `FluentBit -> Data Prepper -> OpenSearch` running through Docker. + +In the future, Data Prepper will offer additional sources and processors that will make more complex log analytics pipelines available. Check out the [Data Prepper Project Roadmap](https://github.com/opensearch-project/data-prepper/projects/1) to see what is coming. + +If there is a specific source, processor, or sink that you would like to include in your log analytics workflow and is not currently on the roadmap, please bring it to our attention by creating a GitHub issue. Additionally, if you are interested in contributing to Data Prepper, see our [Contributing Guidelines](https://github.com/opensearch-project/data-prepper/blob/main/CONTRIBUTING.md) as well as our [developer guide](https://github.com/opensearch-project/data-prepper/blob/main/docs/developer_guide.md) and [plugin development guide](https://github.com/opensearch-project/data-prepper/blob/main/docs/plugin_development.md). diff --git a/_data-prepper/common-use-cases/log-enrichment.md b/_data-prepper/common-use-cases/log-enrichment.md new file mode 100644 index 00000000..b4004251 --- /dev/null +++ b/_data-prepper/common-use-cases/log-enrichment.md @@ -0,0 +1,396 @@ +--- +layout: default +title: Log enrichment +parent: Common use cases +nav_order: 35 +--- + +# Log enrichment + +You can perform different types of log enrichment with Data Prepper, including: + +- Filtering. +- Extracting key-value pairs from strings. +- Mutating events. +- Mutating strings. +- Converting lists to maps. +- Processing incoming timestamps. + +## Filtering + +Use the [`drop_events`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/drop-events/) processor to filter out specific log events before sending them to a sink. For example, if you're collecting web request logs and only want to store unsuccessful requests, you can create the following pipeline, which drops any requests for which the response is less than 400 so that only log events with HTTP status codes of 400 and higher remain. + +```json +log-pipeline: + source: + ... 
+ processor: + - grok: + match: + log: [ "%{COMMONAPACHELOG_DATATYPED}" ] + - drop_events: + drop_when: "/response < 400" + sink: + - opensearch: + ... + index: failure_logs +``` +{% include copy-curl.html %} + +The `drop_when` option specifies which events to drop from the pipeline. + +## Extracting key-value pairs from strings + +Log data often includes strings of key-value pairs. For example, if a user queries a URL that can be paginated, the HTTP logs might contain the following HTTP query string: + +```json +page=3&q=my-search-term +``` +{% include copy-curl.html %} + +To perform analysis using the search terms, you can extract the value of `q` from a query string. The [`key_value`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/key-value/) processor provides robust support for extracting keys and values from strings. + +The following example combines the `split_string` and `key_value` processors to extract query parameters from an Apache log line: + +```json +pipeline + ... + processor: + - grok: + match: + message: [ "%{COMMONAPACHELOG_DATATYPED}" ] + - split_string: + entries: + - source: request + delimiter: "?" + - key_value: + source: "/request/1" + field_split_characters: "&" + value_split_characters: "=" + destination: query_params +``` +{% include copy-curl.html %} + +## Mutating events + +The different [mutate event]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/mutate-event/) processors let you rename, copy, add, and delete event entries. + +In this example, the first processor sets the value of the `debug` key to `true` if the key already exists in the event. The second processor only sets the `debug` key to `true` if the key doesn't exist in the event because `overwrite_if_key_exists` is set to `true`. + +```json +... +processor: + - add_entries: + entries: + - key: "debug" + value: true +... +processor: + - add_entries: + entries: + - key: "debug" + value: true + overwrite_if_key_exists: true +... +``` +{% include copy-curl.html %} + +You can also use a format string to construct new entries from existing events. For example, `${date}-${time}` will create a new entry based on the values of the existing entries `date` and `time`. + +For example, the following pipeline adds new event entries dynamically from existing events: + +```json +processor: + - add_entries: + entries: + - key: "key_three" + format: "${key_one}-${key_two} +``` +{% include copy-curl.html %} + +Consider the following incoming event: + +```json +{ + "key_one": "value_one", + "key_two": "value_two" +} +``` +{% include copy-curl.html %} + +The processor transforms it into an event with a new key named `key_three`, which combines values of other keys in the original event, as shown in the following example: + +```json +{ + "key_one": "value_one", + "key_two": "value_two", + "key_three": "value_one-value_two" +} +``` +{% include copy-curl.html %} + +## Mutating strings + +The various [mutate string]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/mutate-string/) processors offer tools that you can use to manipulate strings in incoming data. For example, if you need to split a string into an array, you can use the `split_string` processor: + +```json +... +processor: + - split_string: + entries: + - source: "message" + delimiter: "&" +... +``` +{% include copy-curl.html %} + +The processor will transform a string such as `a&b&c` into `["a", "b", "c"]`. 
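For example, assuming an incoming event of `{"message": "a&b&c"}`, the processor replaces the value of `message` with the resulting array:

```json
{
  "message": ["a", "b", "c"]
}
```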
+ +## Converting lists to maps + +The [`list_to_map`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/list-to-map/) processor, which is one of the mutate event processors, converts a list of objects in an event to a map. + +For example, consider the following processor configuration: + +```json +... +processor: + - list_to_map: + key: "name" + source: "A-car-as-list" + target: "A-car-as-map" + value_key: "value" + flatten: true +... +``` +{% include copy-curl.html %} + +The following processor will convert an event that contains a list of objects to a map like this: + +```json +{ + "A-car-as-list": [ + { + "name": "make", + "value": "tesla" + }, + { + "name": "model", + "value": "model 3" + }, + { + "name": "color", + "value": "white" + } + ] +} +``` +{% include copy-curl.html %} + +```json +{ + "A-car-as-map": { + "make": "tesla", + "model": "model 3", + "color": "white" + } +} +``` +{% include copy-curl.html %} + +As another example, consider an incoming event with the following structure: + +```json +{ + "mylist" : [ + { + "somekey" : "a", + "somevalue" : "val-a1", + "anothervalue" : "val-a2" + }, + { + "somekey" : "b", + "somevalue" : "val-b1", + "anothervalue" : "val-b2" + }, + { + "somekey" : "b", + "somevalue" : "val-b3", + "anothervalue" : "val-b4" + }, + { + "somekey" : "c", + "somevalue" : "val-c1", + "anothervalue" : "val-c2" + } + ] +} +``` +{% include copy-curl.html %} + +You can define the following options in the processor configuration: + +```json +... +processor: + - list_to_map: + key: "somekey" + source: "mylist" + target: "myobject" + value_key: "value" + flatten: true +... +``` +{% include copy-curl.html %} + +The processor modifies the event by removing `mylist` and adding the new `myobject` object: + +```json +{ + "myobject" : { + "a" : [ + { + "somekey" : "a", + "somevalue" : "val-a1", + "anothervalue" : "val-a2" + } + ], + "b" : [ + { + "somekey" : "b", + "somevalue" : "val-b1", + "anothervalue" : "val-b2" + }, + { + "somekey" : "b", + "somevalue" : "val-b3", + "anothervalue" : "val-b4" + } + "c" : [ + { + "somekey" : "c", + "somevalue" : "val-c1", + "anothervalue" : "val-c2" + } + ] + } +} +``` +{% include copy-curl.html %} + +In many cases, you may want to flatten the array for each key. In these situations, you must choose only one object to retain. The processor offers a choice of either first or last. For example, consider the following: + +```json +... +processor: + - list_to_map: + key: "somekey" + source: "mylist" + target: "myobject" + flatten: true +... +``` +{% include copy-curl.html %} + +The incoming event structure is then flattened accordingly: + +```json +{ + "myobject" : { + "a" : { + "somekey" : "a", + "somevalue" : "val-a1", + "anothervalue" : "val-a2" + }, + "b" : { + "somekey" : "b", + "somevalue" : "val-b1", + "anothervalue" : "val-b2" + } + "c" : { + "somekey" : "c", + "somevalue" : "val-c1", + "anothervalue" : "val-c2" + } + } +} +``` +{% include copy-curl.html %} + +## Processing incoming timestamps + +The [`date`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/date/) processor parses the `timestamp` key from incoming events by converting it to International Organization for Standardization (ISO) 8601 format: + +```json +... + processor: + - date: + match: + - key: timestamp + patterns: ["dd/MMM/yyyy:HH:mm:ss"] + destination: "@timestamp" + source_timezone: "America/Los_Angeles" + destination_timezone: "America/Chicago" + locale: "en_US" +... 
+```
+{% include copy-curl.html %}
+
+If the preceding pipeline processes the following event:
+
+```json
+{"timestamp": "10/Feb/2000:13:55:36"}
+```
+{% include copy-curl.html %}
+
+then the pipeline converts the event into the following format:
+
+```json
+{
+  "timestamp":"10/Feb/2000:13:55:36",
+  "@timestamp":"2000-02-10T15:55:36.000-06:00"
+}
+```
+{% include copy-curl.html %}
+
+### Generating timestamps
+
+The `date` processor can generate timestamps for incoming events if you specify `@timestamp` for the `destination` option:
+
+```json
+...
+  processor:
+    - date:
+        from_time_received: true
+        destination: "@timestamp"
+...
+```
+{% include copy-curl.html %}
+
+### Deriving punctuation patterns
+
+The [`substitute_string`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/substitute-string/) processor (which is one of the mutate string processors) lets you derive a punctuation pattern from incoming events. In the following example pipeline, the processor will scan incoming Apache log events and derive punctuation patterns from them:
+
+```json
+processor:
+  - substitute_string:
+      entries:
+        - source: "message"
+          from: "[a-zA-Z0-9_]+"
+          to: ""
+        - source: "message"
+          from: "[ ]+"
+          to: "_"
+```
+{% include copy-curl.html %}
+
+The following example shows an incoming Apache HTTP log and the punctuation pattern derived from it:
+
+```json
+[{"message":"10.10.10.11 - admin [19/Feb/2015:15:50:36 -0500] \"GET /big2.pdf HTTP/1.1\" 200 33973115 0.202 \"-\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36\""}]
+
+{"message":"..._-_[//:::_-]_\"_/._/.\"_._\"-\"_\"/._(;_)_/._(,_)_/..._/.\""}
+```
+{% include copy-curl.html %}
+
+You can count these generated patterns by passing them through the [`aggregate`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) processor with the `count` action.
diff --git a/_data-prepper/common-use-cases/metrics-traces.md b/_data-prepper/common-use-cases/metrics-traces.md
new file mode 100644
index 00000000..c15eaa09
--- /dev/null
+++ b/_data-prepper/common-use-cases/metrics-traces.md
@@ -0,0 +1,51 @@
+---
+layout: default
+title: Deriving metrics from traces
+parent: Common use cases
+nav_order: 20
+---
+
+# Deriving metrics from traces
+
+You can use Data Prepper to derive metrics from OpenTelemetry traces. The following example pipeline receives incoming traces and extracts a metric called `durationInNanos`, aggregated over a tumbling window of 30 seconds. It then derives a histogram from the incoming traces.
+
+The example configuration contains the following pipelines:
+
+- `entry-pipeline` – Receives trace data from the OpenTelemetry Collector and forwards it to the `trace-to-metrics-pipeline` pipeline.
+
+- `trace-to-metrics-pipeline` – Receives the trace data from the `entry-pipeline` pipeline, aggregates it, and derives a histogram of `durationInNanos` from the traces based on the value of the `serviceName` field. It then sends the derived metrics to the OpenSearch index called `metrics_for_traces`.
+
+```json
+entry-pipeline:
+  source:
+    otel_trace_source:
+      # Provide the path for ingestion. ${pipelineName} will be replaced with the pipeline name.
+      # In this case, it would be "/entry-pipeline/v1/traces". This will be the endpoint URI path in the OpenTelemetry Exporter configuration.
+ path: "/${pipelineName}/v1/traces" + sink: + - pipeline: + name: "trace-to-metrics-pipeline" + +trace-to-metrics-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - aggregate: + # Pick the required identification keys + identification_keys: ["serviceName"] + action: + histogram: + # Pick the appropriate values for each of the following fields + key: "durationInNanos" + record_minmax: true + units: "seconds" + buckets: [0, 10000000, 50000000, 100000000] + # Specify an aggregation period + group_duration: "30s" + sink: + - opensearch: + ... + index: "metrics_for_traces" +``` +{% include copy-curl.html %} diff --git a/_data-prepper/common-use-cases/s3-logs.md b/_data-prepper/common-use-cases/s3-logs.md new file mode 100644 index 00000000..7986a7ee --- /dev/null +++ b/_data-prepper/common-use-cases/s3-logs.md @@ -0,0 +1,180 @@ +--- +layout: default +title: S3 logs +parent: Common use cases +nav_order: 40 +--- + +# S3 logs + +Data Prepper allows you to load logs from [Amazon Simple Storage Service](https://aws.amazon.com/s3/) (Amazon S3), including traditional logs, JSON documents, and CSV logs. + + +## Architecture + +Data Prepper can read objects from S3 buckets using an [Amazon Simple Queue Service (SQS)](https://aws.amazon.com/sqs/) (Amazon SQS) queue and [Amazon S3 Event Notifications](https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html). + +Data Prepper polls the Amazon SQS queue for S3 event notifications. When Data Prepper receives a notification that an S3 object was created, Data Prepper reads and parses that S3 object. + +The following diagram shows the overall architecture of the components involved. + +S3 source architecture{: .img-fluid} + +The flow of data is as follows. + +1. A system produces logs into the S3 bucket. +2. S3 creates an S3 event notification in the SQS queue. +3. Data Prepper polls Amazon SQS for messages and then receives a message. +4. Data Prepper downloads the content from the S3 object. +5. Data Prepper sends a document to OpenSearch for the content in the S3 object. + + +## Pipeline overview + +Data Prepper supports reading data from S3 using the [`s3` source]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3/). + +The following diagram shows a conceptual outline of a Data Prepper pipeline reading from S3. + +S3 source architecture{: .img-fluid} + +## Prerequisites + +Before Data Prepper can read log data from S3, you need the following prerequisites: + +- An S3 bucket. +- A log producer that writes logs to S3. The exact log producer will vary depending on your specific use case, but could include writing logs to S3 or a service such as Amazon CloudWatch. + + +## Getting started + +Use the following steps to begin loading logs from S3 with Data Prepper. + +1. Create an [SQS standard queue](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/step-create-queue.html) for your S3 event notifications. +2. Configure [bucket notifications](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ways-to-add-notification-config-to-bucket.html) for SQS. Use the `s3:ObjectCreated:*` event type. +3. Grant [AWS IAM](https://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) permissions to Data Prepper for accessing SQS and S3. +4. (Recommended) Create an [SQS dead-letter queue](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-dead-letter-queues.html) (DLQ). +5. 
(Recommended) Configure an SQS redrive policy to move failed messages into the DLQ.
+
+### Setting permissions for Data Prepper
+
+To view S3 logs, Data Prepper needs access to Amazon SQS and S3.
+Use the following example to set up permissions:
+
+```json
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "s3-access",
+            "Effect": "Allow",
+            "Action": "s3:GetObject",
+            "Resource": "arn:aws:s3:::/*"
+        },
+        {
+            "Sid": "sqs-access",
+            "Effect": "Allow",
+            "Action": [
+                "sqs:DeleteMessage",
+                "sqs:ReceiveMessage"
+            ],
+            "Resource": "arn:aws:sqs::<123456789012>:"
+        },
+        {
+            "Sid": "kms-access",
+            "Effect": "Allow",
+            "Action": "kms:Decrypt",
+            "Resource": "arn:aws:kms::<123456789012>:key/"
+        }
+    ]
+}
+```
+
+If your S3 objects or SQS queues do not use KMS, you can remove the `kms:Decrypt` permission.
+
+### SQS dead-letter queue
+
+There are two options for handling errors that result from processing S3 objects:
+
+- Use an SQS dead-letter queue (DLQ) to track the failure. This is the recommended approach.
+- Delete the message from SQS. You must manually find the S3 object and correct the error.
+
+The following diagram shows the system architecture when using SQS with a DLQ.
+
+S3 source architecture with dlq{: .img-fluid}
+
+To use an SQS dead-letter queue, perform the following steps:
+
+1. Create a new SQS standard queue to act as your DLQ.
+2. Configure your SQS queue's redrive policy [to use your DLQ](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-configure-dead-letter-queue.html). Consider using a low value such as 2 or 3 for the "Maximum Receives" setting.
+3. Configure the Data Prepper `s3` source to use `retain_messages` for `on_error`. This is the default behavior.
+
+## Pipeline design
+
+Create a pipeline to read logs from S3, starting with an [`s3`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3/) source plugin. Use the following example for guidance.
+
+```yaml
+s3-log-pipeline:
+  source:
+    s3:
+      notification_type: sqs
+      compression: gzip
+      codec:
+        newline:
+      sqs:
+        # Change this value to your SQS Queue URL
+        queue_url: "arn:aws:sqs::<123456789012>:"
+        visibility_timeout: "2m"
+```
+
+Configure the following options according to your use case:
+
+* `queue_url`: This is the SQS queue URL. It is always unique to your pipeline.
+* `codec`: The codec determines how to parse the incoming data.
+* `visibility_timeout`: Configure this value to be large enough for Data Prepper to process 10 S3 objects. However, if you make this value too large, messages that fail to process will take at least as long as the specified value before Data Prepper retries them.
+
+The default values for each option work for the majority of use cases. For all available options for the S3 source, see [`s3`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3/).
+
+```yaml
+s3-log-pipeline:
+  source:
+    s3:
+      notification_type: sqs
+      compression: gzip
+      codec:
+        newline:
+      sqs:
+        # Change this value to your SQS Queue URL
+        queue_url: "arn:aws:sqs::<123456789012>:"
+        visibility_timeout: "2m"
+      aws:
+        # Specify the correct region
+        region: ""
+        # This shows using an STS role, but you can also use your system's default permissions.
+        sts_role_arn: "arn:aws:iam::<123456789012>:role/"
+  processor:
+    # You can configure a grok pattern to enrich your documents in OpenSearch.
+    #- grok:
+    #    match:
+    #      message: [ "%{COMMONAPACHELOG}" ]
+  sink:
+    - opensearch:
+        hosts: [ "https://localhost:9200" ]
+        # Change to your credentials
+        username: "admin"
+        password: "admin"
+        index: s3_logs
+```
+
+## Multiple Data Prepper pipelines
+
+We recommend that you have one SQS queue per Data Prepper pipeline. In addition, you can have multiple nodes in the same cluster reading from the same SQS queue, which doesn't require additional configuration with Data Prepper.
+
+If you have multiple pipelines, you must create a separate SQS queue for each pipeline, even if the pipelines use the same S3 bucket.
+
+## Amazon SNS fanout pattern
+
+To meet the scale of logs produced by S3, some users require multiple SQS queues for their logs. You can use [Amazon Simple Notification Service](https://docs.aws.amazon.com/sns/latest/dg/welcome.html) (Amazon SNS) to route S3 event notifications to multiple SQS queues by using a [fanout pattern](https://docs.aws.amazon.com/sns/latest/dg/sns-common-scenarios.html). Using SNS, all S3 event notifications are sent directly to a single SNS topic, to which you can subscribe multiple SQS queues.
+
+To make sure that Data Prepper can directly parse the event from the SNS topic, configure [raw message delivery](https://docs.aws.amazon.com/sns/latest/dg/sns-large-payload-raw-message-delivery.html) on the SNS-to-SQS subscription. Setting this option will not affect other SQS queues that are subscribed to that SNS topic.
+
+
diff --git a/_data-prepper/common-use-cases/sampling.md b/_data-prepper/common-use-cases/sampling.md
new file mode 100644
index 00000000..7c77e8c3
--- /dev/null
+++ b/_data-prepper/common-use-cases/sampling.md
@@ -0,0 +1,78 @@
+---
+layout: default
+title: Sampling
+parent: Common use cases
+nav_order: 45
+---
+
+# Sampling
+
+Data Prepper provides the following sampling capabilities:
+
+- Time sampling
+- Percentage sampling
+- Tail sampling
+
+## Time sampling
+
+You can use the `rate_limiter` action within the [`aggregate` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) to limit the number of events that can be processed per second. You can choose to either drop excess events or carry them forward to the next time period.
+
+In the following example, at most 100 events per second with a status code of `200` are sent to the sink for each IP address. The `when_exceeds` option is set to `drop`, which means that all excess events from the configured time window will be dropped.
+
+```json
+...
+  processor:
+    - aggregate:
+        identification_keys: ["clientip"]
+        action:
+          rate_limiter:
+            events_per_second: 100
+            when_exceeds: drop
+        when: "/status == 200"
+...
+```
+
+If you instead set the `when_exceeds` option to `block`, the processor will block the pipeline until the time window has elapsed. Then it will process the blocked events.
+
+## Percentage sampling
+
+Use the `percent_sampler` action within the `aggregate` processor to limit the number of events that are sent to a sink. All excess events will be dropped.
+
+In the following example, only 20% of events with a status code of `200` are sent to the sink from a given IP address:
+
+```json
+...
+  processor:
+    - aggregate:
+        identification_keys: ["clientip"]
+        action:
+          percent_sampler:
+            percent: 20
+        when: "/status == 200"
+...
+```
+
+## Tail sampling
+
+Use the `tail_sampler` action within the `aggregate` processor to sample events based on a set of defined policies.
This action waits for an aggregation to complete across different aggregation periods based on the configured wait period. When an aggregation is complete, if it matches the specified error condition, it is sent to the sink. Otherwise, only a configured percentage of events is sent to the sink.
+
+The following pipeline sends all OpenTelemetry traces with an error condition status of `2` to the sink. It only sends 20% of the traces that don't match this error condition to the sink.
+
+```json
+...
+  processor:
+    - aggregate:
+        identification_keys: ["traceId"]
+        action:
+          tail_sampler:
+            percent: 20
+            wait_period: "10s"
+            condition: "/status == 2"
+
+...
+```
+
+If you set the error condition to `false` or don't include it, only the configured percentage of events is allowed to pass through, as determined by a probabilistic outcome.
+
+Because it can be difficult to determine exactly when tail sampling should occur, you can use the `wait_period` option to measure the idle time since the last event was received.
diff --git a/_data-prepper/common-use-cases/text-processing.md b/_data-prepper/common-use-cases/text-processing.md
new file mode 100644
index 00000000..041ca63a
--- /dev/null
+++ b/_data-prepper/common-use-cases/text-processing.md
@@ -0,0 +1,215 @@
+---
+layout: default
+title: Text processing
+parent: Common use cases
+nav_order: 55
+---
+
+# Text processing
+
+Data Prepper provides text processing capabilities with the [`grok processor`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/grok/). The `grok` processor is based on the [`java-grok`](https://mvnrepository.com/artifact/io.krakens/java-grok) library and supports all compatible patterns. The `java-grok` library is built using the [`java.util.regex`](https://docs.oracle.com/javase/8/docs/api/java/util/regex/package-summary.html) regular expression library.
+
+You can add custom patterns to your pipelines by using the `pattern_definitions` option. When debugging custom patterns, the [Grok Debugger](https://grokdebugger.com/) can be helpful.
+
+## Basic usage
+
+To get started with text processing, create the following pipeline:
+
+```json
+pattern-matching-pipeline:
+  source:
+    ...
+  processor:
+    - grok:
+        match:
+          message: ['%{IPORHOST:clientip} \[%{HTTPDATE:timestamp}\] %{NUMBER:response_status:int}']
+  sink:
+    - opensearch:
+        # Provide an OpenSearch cluster endpoint
+```
+{% include copy-curl.html %}
+
+An incoming message might contain the following content:
+
+```json
+{"message": "127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200"}
+```
+{% include copy-curl.html %}
+
+In each incoming event, the pipeline will locate the value in the `message` key and attempt to match the pattern. The keywords `IPORHOST`, `HTTPDATE`, and `NUMBER` are built into the plugin.
+
+When an incoming record matches the pattern, it generates an internal event such as the following, with identification keys extracted from the original message:
+
+```json
+{
+  "message":"127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200",
+  "response_status":200,
+  "clientip":"198.126.12",
+  "timestamp":"10/Oct/2000:13:55:36 -0700"
+}
+```
+{% include copy-curl.html %}
+
+The `match` configuration for the `grok` processor specifies which record keys to match against which patterns.
+
+In the following example, the `match` configuration checks incoming logs for a `message` key. If the key exists, it matches the key value against the `SYSLOGBASE` pattern and then against the `COMMONAPACHELOG` pattern.
It then checks the logs for a `timestamp` key. If that key exists, it attempts to match the key value against the `TIMESTAMP_ISO8601` pattern.
+
+```json
+processor:
+  - grok:
+      match:
+        message: ['%{SYSLOGBASE}', "%{COMMONAPACHELOG}"]
+        timestamp: ["%{TIMESTAMP_ISO8601}"]
+```
+{% include copy-curl.html %}
+
+By default, the plugin continues until it finds a successful match. For example, if there is a successful match against the value in the `message` key for a `SYSLOGBASE` pattern, the plugin doesn't attempt to match the other patterns. To match logs against every pattern, set the `break_on_match` option to `false`.
+
+## Including named and empty captures
+
+Include the `keep_empty_captures` option in your pipeline configuration to include null captures or the `named_captures_only` option to include only named captures. Named captures follow the pattern `%{SYNTAX:SEMANTIC}`, while unnamed captures follow the pattern `%{SYNTAX}`.
+
+For example, you can modify the preceding Grok configuration to remove `clientip` from the `%{IPORHOST}` pattern:
+
+```json
+processor:
+  - grok:
+      match:
+        message: ['%{IPORHOST} \[%{HTTPDATE:timestamp}\] %{NUMBER:response_status:int}']
+```
+{% include copy-curl.html %}
+
+The resulting grokked log will look like this:
+
+```json
+{
+  "message":"127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200",
+  "response_status":200,
+  "timestamp":"10/Oct/2000:13:55:36 -0700"
+}
+```
+{% include copy-curl.html %}
+
+Notice that the `clientip` key no longer exists because the `%{IPORHOST}` pattern is now an unnamed capture.
+
+However, if you set `named_captures_only` to `false`:
+
+```json
+processor:
+  - grok:
+      named_captures_only: false
+      match:
+        message: ['%{IPORHOST} \[%{HTTPDATE:timestamp}\] %{NUMBER:response_status:int}']
+```
+{% include copy-curl.html %}
+
+Then the resulting grokked log will look like this:
+
+```json
+{
+  "message":"127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200",
+  "MONTH":"Oct",
+  "YEAR":"2000",
+  "response_status":200,
+  "HOUR":"13",
+  "TIME":"13:55:36",
+  "MINUTE":"55",
+  "SECOND":"36",
+  "IPORHOST":"198.126.12",
+  "MONTHDAY":"10",
+  "INT":"-0700",
+  "timestamp":"10/Oct/2000:13:55:36 -0700"
+}
+```
+{% include copy-curl.html %}
+
+Note that the `IPORHOST` capture now shows up as a new key, along with some internal unnamed captures like `MONTH` and `YEAR`. The `HTTPDATE` pattern uses these patterns internally, as you can see in the default patterns file.
+
+## Overwriting keys
+
+Include the `keys_to_overwrite` option to specify which existing record keys to overwrite if there is a capture with the same key value.
+
+For example, you can modify the preceding Grok configuration to replace `%{NUMBER:response_status:int}` with `%{NUMBER:message:int}` and add `message` to the list of keys to overwrite:
+
+```json
+processor:
+  - grok:
+      keys_to_overwrite: ["message"]
+      match:
+        message: ['%{IPORHOST:clientip} \[%{HTTPDATE:timestamp}\] %{NUMBER:message:int}']
+```
+{% include copy-curl.html %}
+
+In the resulting grokked log, the original message is overwritten with the number `200`:
+
+```json
+{
+  "message":200,
+  "clientip":"198.126.12",
+  "timestamp":"10/Oct/2000:13:55:36 -0700"
+}
+```
+{% include copy-curl.html %}
+
+## Using custom patterns
+
+Include the `pattern_definitions` option in your Grok configuration to specify custom patterns.
+
+The following configuration creates custom regex patterns named `CUSTOM_PATTERN_1` and `CUSTOM_PATTERN_2`. By default, the plugin continues until it finds a successful match.
+
+```json
+processor:
+  - grok:
+      pattern_definitions:
+        CUSTOM_PATTERN_1: 'this-is-regex-1'
+        CUSTOM_PATTERN_2: '%{CUSTOM_PATTERN_1} REGEX'
+      match:
+        message: ["%{CUSTOM_PATTERN_2:my_pattern_key}"]
+```
+{% include copy-curl.html %}
+
+If you specify `break_on_match` as `false`, the pipeline attempts to match all patterns and extract keys from the incoming events:
+
+```json
+processor:
+  - grok:
+      pattern_definitions:
+        CUSTOM_PATTERN_1: 'this-is-regex-1'
+        CUSTOM_PATTERN_2: 'this-is-regex-2'
+        CUSTOM_PATTERN_3: 'this-is-regex-3'
+        CUSTOM_PATTERN_4: 'this-is-regex-4'
+      match:
+        message: [ "%{CUSTOM_PATTERN_1}", "%{CUSTOM_PATTERN_2}" ]
+        log: [ "%{CUSTOM_PATTERN_3}", "%{CUSTOM_PATTERN_4}" ]
+      break_on_match: false
+```
+{% include copy-curl.html %}
+
+You can define your own custom patterns to use for pipeline pattern matching. In the first example, `my_pattern_key` will be extracted after matching the custom patterns.
+
+## Storing captures with a parent key
+
+Include the `target_key` option in your Grok configuration to wrap all record captures in an additional outer key value.
+
+For example, you can modify the preceding Grok configuration to add a target key named `grokked`:
+
+```json
+processor:
+  - grok:
+      target_key: "grokked"
+      match:
+        message: ['%{IPORHOST} \[%{HTTPDATE:timestamp}\] %{NUMBER:response_status:int}']
+```
+
+The resulting grokked log will look like this:
+
+```json
+{
+  "message":"127.0.0.1 198.126.12 [10/Oct/2000:13:55:36 -0700] 200",
+  "grokked": {
+     "response_status":200,
+     "clientip":"198.126.12",
+     "timestamp":"10/Oct/2000:13:55:36 -0700"
+  }
+}
+```
diff --git a/_data-prepper/common-use-cases/trace-analytics.md b/_data-prepper/common-use-cases/trace-analytics.md
new file mode 100644
index 00000000..1f6c3b7c
--- /dev/null
+++ b/_data-prepper/common-use-cases/trace-analytics.md
@@ -0,0 +1,377 @@
+---
+layout: default
+title: Trace analytics
+parent: Common use cases
+nav_order: 60
+---
+
+# Trace analytics
+
+Trace analytics allows you to collect trace data and customize a pipeline that ingests and transforms the data for use in OpenSearch. The following provides an overview of the trace analytics workflow in Data Prepper, how to configure it, and how to visualize trace data.
+
+## Introduction
+
+When using Data Prepper as a server-side component to collect trace data, you can customize a Data Prepper pipeline to ingest and transform the data for use in OpenSearch. After transformation, you can visualize the trace data with the Observability plugin in OpenSearch Dashboards. Trace data provides visibility into your application's performance and helps you gain more information about individual traces.
+
+The following flowchart illustrates the trace analytics workflow, from running OpenTelemetry Collector to using OpenSearch Dashboards for visualization.
+
+Trace analytics component overview{: .img-fluid}
+
+To monitor trace analytics, you need to set up the following components in your service environment:
+- Add **instrumentation** to your application so it can generate telemetry data and send it to an OpenTelemetry collector.
+- Run an **OpenTelemetry collector** as a sidecar or DaemonSet for Amazon Elastic Kubernetes Service (Amazon EKS), a sidecar for Amazon Elastic Container Service (Amazon ECS), or an agent on Amazon Elastic Compute Cloud (Amazon EC2). You should configure the collector to export trace data to Data Prepper.
+- Deploy **Data Prepper** as the ingestion collector for OpenSearch.
Configure it to send the enriched trace data to your OpenSearch cluster or to an Amazon OpenSearch Service domain.
+- Use **OpenSearch Dashboards** to visualize and detect problems in your distributed applications.
+
+## Trace analytics pipeline
+
+To monitor trace analytics in Data Prepper, we provide three pipelines: `entry-pipeline`, `raw-trace-pipeline`, and `service-map-pipeline`. The following image provides an overview of how the pipelines work together to monitor trace analytics.
+
+Trace analytics pipeline overview{: .img-fluid}
+
+
+### OpenTelemetry trace source
+
+The [OpenTelemetry source]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/otel-trace/) accepts trace data from the OpenTelemetry Collector. The source follows the [OpenTelemetry Protocol](https://github.com/open-telemetry/opentelemetry-specification/tree/master/specification/protocol) and officially supports transport over gRPC and the use of industry-standard encryption (TLS/HTTPS).
+
+### Processor
+
+There are three processors for the trace analytics feature:
+
+* *otel_traces_raw* – The *otel_traces_raw* processor receives a collection of [span](https://github.com/opensearch-project/data-prepper/blob/fa65e9efb3f8d6a404a1ab1875f21ce85e5c5a6d/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/trace/Span.java) records from [*otel-trace-source*]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/otel-trace/), and performs stateful processing, extraction, and completion of trace-group-related fields.
+* *otel_traces_group* – The *otel_traces_group* processor fills in the missing trace-group-related fields in the collection of [span](https://github.com/opensearch-project/data-prepper/blob/298e7931aa3b26130048ac3bde260e066857df54/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/trace/Span.java) records by looking up the OpenSearch backend.
+* *service_map_stateful* – The *service_map_stateful* processor performs the required preprocessing for trace data and builds metadata to display the `service-map` dashboards.
+
+
+### OpenSearch sink
+
+Data Prepper provides a generic sink that writes data to OpenSearch as the destination. The [OpenSearch sink]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sinks/opensearch/) has configuration options related to the OpenSearch cluster, such as endpoint, SSL, username/password, index name, index template, and index state management.
+
+The sink provides specific configurations for the trace analytics feature. These configurations allow the sink to use indexes and index templates specific to trace analytics. The following OpenSearch indexes are specific to trace analytics:
+
+* *otel-v1-apm-span* – The *otel-v1-apm-span* index stores the output from the [otel_traces_raw]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/otel-trace-raw/) processor.
+* *otel-v1-apm-service-map* – The *otel-v1-apm-service-map* index stores the output from the [service_map_stateful]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/service-map-stateful/) processor.
+
+## Trace tuning
+
+Starting with version 0.8.x, Data Prepper supports both vertical and horizontal scaling for trace analytics. You can adjust the size of a single Data Prepper instance to meet your workload's demands and scale vertically.
+
+You can scale horizontally by using the core [peer forwarder]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/peer-forwarder/) to deploy multiple Data Prepper instances to form a cluster. This enables Data Prepper instances to communicate with instances in the cluster and is required for horizontally scaling deployments.
+
+### Scaling recommendations
+
+Use the following recommended configurations to scale Data Prepper. We recommend that you modify parameters based on your workload requirements. We also recommend that you monitor the Data Prepper host metrics and OpenSearch metrics to ensure that the configuration works as expected.
+
+#### Buffer
+
+The total number of trace requests processed by Data Prepper is equal to the sum of the `buffer_size` values in `otel-trace-pipeline` and `raw-pipeline`. The total number of trace requests sent to OpenSearch is equal to the product of `batch_size` and `workers` in `raw-trace-pipeline`. For more information about `raw-pipeline`, see [Trace analytics pipeline]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines).
+
+
+We recommend the following when making changes to buffer settings:
+ * The `buffer_size` value in `otel-trace-pipeline` and `raw-pipeline` should be the same.
+ * The `buffer_size` should be greater than or equal to `workers` * `batch_size` in the `raw-pipeline`.
+
+
+#### Workers
+
+The `workers` setting determines the number of threads that are used by Data Prepper to process requests from the buffer. We recommend that you set `workers` based on CPU utilization. This value can be higher than the number of available processors because Data Prepper uses significant input/output time when sending data to OpenSearch.
+
+#### Heap
+
+Configure the Data Prepper heap by setting the `JVM_OPTS` environment variable. We recommend that you set the heap value to a minimum value of `4` * `batch_size` * `otel_send_batch_size` * `maximum size of an individual span`.
+
+As mentioned in the [OpenTelemetry Collector](#opentelemetry-collector) section, set `otel_send_batch_size` to a value of `50` in your OpenTelemetry Collector configuration.
+
+#### Local disk
+
+Data Prepper uses the local disk to store metadata required for service map processing, so we recommend storing only the following key fields: `traceId`, `spanId`, `parentSpanId`, `spanKind`, `spanName`, and `serviceName`. The `service-map` plugin stores only two files, each of which stores `window_duration` seconds of data. As an example, testing with a throughput of `3000 spans/second` resulted in a total disk usage of `4 MB`.
+
+Data Prepper also uses the local disk to write logs. In the most recent version of Data Prepper, you can redirect the logs to your preferred path.
+
+
+### AWS CloudFormation template and Kubernetes/Amazon EKS configuration files
+
+The [AWS CloudFormation](https://github.com/opensearch-project/data-prepper/blob/main/deployment-template/ec2/data-prepper-ec2-deployment-cfn.yaml) template provides a user-friendly mechanism for configuring the scaling attributes described in the [Trace tuning](#trace-tuning) section.
+
+The [Kubernetes configuration files](https://github.com/opensearch-project/data-prepper/blob/main/examples/dev/k8s/README.md) and [Amazon EKS configuration files](https://github.com/opensearch-project/data-prepper/blob/main/deployment-template/eks/README.md) are available for configuring these attributes in a cluster deployment.
+
+### Benchmark tests
+
+The benchmark tests were performed on an `r5.xlarge` EC2 instance with the following configuration:
+
+ * `buffer_size`: 4096
+ * `batch_size`: 256
+ * `workers`: 8
+ * `Heap`: 10 GB
+
+This setup was able to handle a throughput of `2100` spans/second at `20` percent CPU utilization.
+
+## Pipeline configuration
+
+The following sections provide examples of different types of pipelines and how to configure each type.
+
+### Example: Trace analytics pipeline
+
+The following example demonstrates how to build a pipeline that supports the [OpenSearch Dashboards Observability plugin]({{site.url}}{{site.baseurl}}/observability-plugin/trace/ta-dashboards/). This pipeline takes data from the OpenTelemetry Collector and uses two other pipelines as sinks. These two separate pipelines serve two different purposes and write to different OpenSearch indexes. The first pipeline prepares trace data for OpenSearch and enriches and ingests the span documents into a span index within OpenSearch. The second pipeline aggregates traces into a service map and writes service map documents into a service map index within OpenSearch.
+
+Starting with Data Prepper version 2.0, Data Prepper no longer supports the `otel_traces_raw_prepper` processor. Use the `otel_traces_raw` processor instead, which replaces it and supports some of Data Prepper's recent data model changes. See the following YAML file example:
+
+```yml
+entry-pipeline:
+  delay: "100"
+  source:
+    otel_traces_source:
+      ssl: false
+  buffer:
+    bounded_blocking:
+      buffer_size: 10240
+      batch_size: 160
+  sink:
+    - pipeline:
+        name: "raw-trace-pipeline"
+    - pipeline:
+        name: "service-map-pipeline"
+raw-trace-pipeline:
+  source:
+    pipeline:
+      name: "entry-pipeline"
+  buffer:
+    bounded_blocking:
+      buffer_size: 10240
+      batch_size: 160
+  processor:
+    - otel_traces_raw:
+  sink:
+    - opensearch:
+        hosts: ["https://localhost:9200"]
+        insecure: true
+        username: admin
+        password: admin
+        index_type: trace-analytics-raw
+service-map-pipeline:
+  delay: "100"
+  source:
+    pipeline:
+      name: "entry-pipeline"
+  buffer:
+    bounded_blocking:
+      buffer_size: 10240
+      batch_size: 160
+  processor:
+    - service_map_stateful:
+  sink:
+    - opensearch:
+        hosts: ["https://localhost:9200"]
+        insecure: true
+        username: admin
+        password: admin
+        index_type: trace-analytics-service-map
+```
+
+To maintain similar ingestion throughput and latency, scale the `buffer_size` and `batch_size` by the estimated maximum batch size in the client request payload.
+{: .tip}
+
+#### Example: `otel-trace-source`
+
+The following is an example `otel-trace-source` .yaml file with SSL and basic authentication enabled. Note that you will need to modify your `otel-collector-config.yaml` file so that it uses your own credentials.
+
+```yaml
+source:
+  otel_traces_source:
+    #record_type: event # Add this when using Data Prepper 1.x. This option is removed in 2.0
+    ssl: true
+    sslKeyCertChainFile: "/full/path/to/certfile.crt"
+    sslKeyFile: "/full/path/to/keyfile.key"
+    authentication:
+      http_basic:
+        username: "my-user"
+        password: "my_s3cr3t"
+```
+
+#### Example: pipeline.yaml
+
+The following is an example `pipeline.yaml` file without SSL and basic authentication enabled for the `otel-trace-pipeline` pipeline:
+
+```yaml
+otel-trace-pipeline:
+  # workers is the number of threads processing data in each pipeline.
+  # We recommend the same value for all pipelines.
+  # The default value is 1. Set a value based on the machine you are running Data Prepper on.
+  workers: 8
+  # delay in milliseconds is how often the worker threads should process data.
+  # Recommend not to change this config as we want the entry pipeline to process data as quickly as possible.
+  # default value is 3_000 ms
+  delay: "100"
+  source:
+    otel_traces_source:
+      #record_type: event # Add this when using Data Prepper 1.x. This option is removed in 2.0
+      ssl: false # Change this to enable encryption in transit
+      authentication:
+        unauthenticated:
+  buffer:
+    bounded_blocking:
+      # buffer_size is the number of ExportTraceRequest from otel-collector the data prepper should hold in memory.
+      # We recommend to keep the same buffer_size for all pipelines.
+      # Make sure you configure sufficient heap
+      # default value is 512
+      buffer_size: 512
+      # This is the maximum number of request each worker thread will process within the delay.
+      # Default is 8.
+      # Make sure buffer_size >= workers * batch_size
+      batch_size: 8
+  sink:
+    - pipeline:
+        name: "raw-pipeline"
+    - pipeline:
+        name: "service-map-pipeline"
+raw-pipeline:
+  # Configure same as the otel-trace-pipeline
+  workers: 8
+  # We recommend using the default value for the raw-pipeline.
+  delay: "3000"
+  source:
+    pipeline:
+      name: "otel-trace-pipeline"
+  buffer:
+    bounded_blocking:
+      # Configure the same value as in otel-trace-pipeline
+      # Make sure you configure sufficient heap
+      # The default value is 512
+      buffer_size: 512
+      # The raw processor does bulk requests to your OpenSearch sink, so configure the batch_size higher.
+      # If you use the recommended otel-collector setup, each ExportTraceRequest could contain a maximum of 50 spans. https://github.com/opensearch-project/data-prepper/tree/v0.7.x/deployment/aws
+      # With a batch size of 64, each worker thread could process up to 3200 spans (64 * 50).
+      batch_size: 64
+  processor:
+    - otel_traces_raw:
+    - otel_traces_group:
+        hosts: [ "https://localhost:9200" ]
+        # Change to your credentials
+        username: "admin"
+        password: "admin"
+        # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate
+        #cert: /path/to/cert
+        # If you are connecting to an Amazon OpenSearch Service domain without
+        # Fine-Grained Access Control, enable these settings. Comment out the
+        # username and password above.
+        #aws_sigv4: true
+        #aws_region: us-east-1
+  sink:
+    - opensearch:
+        hosts: [ "https://localhost:9200" ]
+        index_type: trace-analytics-raw
+        # Change to your credentials
+        username: "admin"
+        password: "admin"
+        # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate
+        #cert: /path/to/cert
+        # If you are connecting to an Amazon OpenSearch Service domain without
+        # Fine-Grained Access Control, enable these settings. Comment out the
+        # username and password above.
+        #aws_sigv4: true
+        #aws_region: us-east-1
+service-map-pipeline:
+  workers: 8
+  delay: "100"
+  source:
+    pipeline:
+      name: "otel-trace-pipeline"
+  processor:
+    - service_map_stateful:
+        # The window duration is the maximum length of time Data Prepper stores the most recent trace data to evaluate service-map relationships.
+        # The default is 3 minutes, which means that relationships between services can be detected from spans reported in the last 3 minutes.
+        # Set a higher value if your applications have higher latency.
+        window_duration: 180
+  buffer:
+    bounded_blocking:
+      # buffer_size is the number of ExportTraceRequest from otel-collector the data prepper should hold in memory.
+ # We recommend to keep the same buffer_size for all pipelines. + # Make sure you configure sufficient heap + # default value is 512 + buffer_size: 512 + # This is the maximum number of request each worker thread will process within the delay. + # Default is 8. + # Make sure buffer_size >= workers * batch_size + batch_size: 8 + sink: + - opensearch: + hosts: [ "https://localhost:9200" ] + index_type: trace-analytics-service-map + # Change to your credentials + username: "admin" + password: "admin" + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + #cert: /path/to/cert + # If you are connecting to an Amazon OpenSearch Service domain without + # Fine-Grained Access Control, enable these settings. Comment out the + # username and password above. + #aws_sigv4: true + #aws_region: us-east-1 +``` + +You need to modify the preceding configuration for your OpenSearch cluster so that the configuration matches your environment. Note that it has two `opensearch` sinks that need to be modified. +{: .note} + +You must make the following changes: +* `hosts` – Set to your hosts. +* `username` – Provide your OpenSearch username. +* `password` – Provide your OpenSearch password. +* `aws_sigv4` – If you are using Amazon OpenSearch Service with AWS signing, set this value to `true`. It will sign requests with the default AWS credentials provider. +* `aws_region` – If you are using Amazon OpenSearch Service with AWS signing, set this value to your AWS Region. + +For other configurations available for OpenSearch sinks, see [Data Prepper OpenSearch sink]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sinks/opensearch/). + +## OpenTelemetry Collector + +You need to run OpenTelemetry Collector in your service environment. Follow [Getting Started](https://opentelemetry.io/docs/collector/getting-started/#getting-started) to install an OpenTelemetry collector. Ensure that you configure the collector with an exporter configured for your Data Prepper instance. The following example `otel-collector-config.yaml` file receives data from various instrumentations and exports it to Data Prepper. + +### Example otel-collector-config.yaml file + +The following is an example `otel-collector-config.yaml` file: + +``` +receivers: + jaeger: + protocols: + grpc: + otlp: + protocols: + grpc: + zipkin: + +processors: + batch/traces: + timeout: 1s + send_batch_size: 50 + +exporters: + otlp/data-prepper: + endpoint: localhost:21890 + tls: + insecure: true + +service: + pipelines: + traces: + receivers: [jaeger, otlp, zipkin] + processors: [batch/traces] + exporters: [otlp/data-prepper] +``` + +After you run OpenTelemetry in your service environment, you must configure your application to use the OpenTelemetry Collector. The OpenTelemetry Collector typically runs alongside your application. + +## Next steps and more information + +The [OpenSearch Dashboards Observability plugin]({{site.url}}{{site.baseurl}}/observability-plugin/trace/ta-dashboards/) documentation provides additional information about configuring OpenSearch to view trace analytics in OpenSearch Dashboards. + +For more information about how to tune and scale Data Prepper for trace analytics, see [Trace tuning](#trace-tuning). + +## Migrating to Data Prepper 2.0 + +Starting with Data Prepper version 1.4, trace processing uses Data Prepper's event model. This allows pipeline authors to configure other processors to modify spans or traces. 
To provide a migration path, Data Prepper version 1.4 introduced the following changes: + +* `otel_traces_source` has an optional `record_type` parameter that can be set to `event`. When configured, it will output event objects. +* `otel_traces_raw` replaces `otel_traces_raw_prepper` for event-based spans. +* `otel_traces_group` replaces `otel_traces_group_prepper` for event-based spans. + +In Data Prepper version 2.0, `otel_traces_source` will only output events. Data Prepper version 2.0 also removes `otel_traces_raw_prepper` and `otel_traces_group_prepper` entirely. To migrate to Data Prepper version 2.0, you can configure your trace pipeline using the event model. + \ No newline at end of file diff --git a/_data-prepper/getting-started.md b/_data-prepper/getting-started.md new file mode 100644 index 00000000..624cd5fc --- /dev/null +++ b/_data-prepper/getting-started.md @@ -0,0 +1,160 @@ +--- +layout: default +title: Getting started +nav_order: 5 +redirect_from: + - /clients/data-prepper/get-started/ +--- + +# Getting started with Data Prepper + +Data Prepper is an independent component, not an OpenSearch plugin, that converts data for use with OpenSearch. It's not bundled with the all-in-one OpenSearch installation packages. + +If you are migrating from Open Distro Data Prepper, see [Migrating from Open Distro]({{site.url}}{{site.baseurl}}/data-prepper/migrate-open-distro/). +{: .note} + +## 1. Installing Data Prepper + +There are two ways to install Data Prepper: you can run the Docker image or build from source. + +The easiest way to use Data Prepper is by running the Docker image. We suggest that you use this approach if you have [Docker](https://www.docker.com) available. Run the following command: + +``` +docker pull opensearchproject/data-prepper:latest +``` +{% include copy.html %} + +If you have special requirements that require you to build from source, or if you want to contribute, see the [Developer Guide](https://github.com/opensearch-project/data-prepper/blob/main/docs/developer_guide.md). + +## 2. Configuring Data Prepper + +Two configuration files are required to run a Data Prepper instance. Optionally, you can configure a Log4j 2 configuration file. See [Configuring Log4j]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/configuring-log4j/) for more information. The following list describes the purpose of each configuration file: + +* `pipelines.yaml`: This file describes which data pipelines to run, including sources, processors, and sinks. +* `data-prepper-config.yaml`: This file contains Data Prepper server settings that allow you to interact with exposed Data Prepper server APIs. +* `log4j2-rolling.properties` (optional): This file contains Log4j 2 configuration options and can be a JSON, YAML, XML, or .properties file type. + +For Data Prepper versions earlier than 2.0, the `.jar` file expects the pipeline configuration file path to be followed by the server configuration file path. See the following configuration path example: + +``` +java -jar data-prepper-core-$VERSION.jar pipelines.yaml data-prepper-config.yaml +``` + +Optionally, you can add `"-Dlog4j.configurationFile=config/log4j2.properties"` to the command to pass a custom Log4j 2 configuration file. If you don't provide a properties file, Data Prepper defaults to the `log4j2.properties` file in the `shared-config` directory. 
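+
+For example, a full launch command for a version earlier than 2.0 that passes a custom Log4j 2 configuration might look like the following (a sketch; the file paths shown are illustrative):
+
+```
+java "-Dlog4j.configurationFile=config/log4j2.properties" -jar data-prepper-core-$VERSION.jar pipelines.yaml data-prepper-config.yaml
+```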
+ + +Starting with Data Prepper 2.0, you can launch Data Prepper by using the following `data-prepper` script that does not require any additional command line arguments: + +``` +bin/data-prepper +``` + +Configuration files are read from specific subdirectories in the application's home directory: +1. `pipelines/`: Used for pipeline configurations. Pipeline configurations can be written in one or more YAML files. +2. `config/data-prepper-config.yaml`: Used for the Data Prepper server configuration. + +You can supply your own pipeline configuration file path followed by the server configuration file path. However, this method will not be supported in a future release. See the following example: +``` +bin/data-prepper pipelines.yaml data-prepper-config.yaml +``` + +The Log4j 2 configuration file is read from the `config/log4j2.properties` file located in the application's home directory. + +To configure Data Prepper, see the following information for each use case: + +* [Trace analytics]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/trace-analytics/): Learn how to collect trace data and customize a pipeline that ingests and transforms that data. +* [Log analytics]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/log-analytics/): Learn how to set up Data Prepper for log observability. + +## 3. Defining a pipeline + +Create a Data Prepper pipeline file named `pipelines.yaml` using the following configuration: + +```yml +simple-sample-pipeline: + workers: 2 + delay: "5000" + source: + random: + sink: + - stdout: +``` +{% include copy.html %} + +## 4. Running Data Prepper + +Run the following command with your pipeline configuration YAML. + +```bash +docker run --name data-prepper \ + -v /${PWD}/pipelines.yaml:/usr/share/data-prepper/pipelines/pipelines.yaml \ + opensearchproject/data-prepper:latest + +``` +{% include copy.html %} + +The example pipeline configuration above demonstrates a simple pipeline with a source (`random`) sending data to a sink (`stdout`). For examples of more advanced pipeline configurations, see [Pipelines]({{site.url}}{{site.baseurl}}/clients/data-prepper/pipelines/). 
+ +After starting Data Prepper, you should see log output and some UUIDs after a few seconds: + +```yml +2021-09-30T20:19:44,147 [main] INFO com.amazon.dataprepper.pipeline.server.DataPrepperServer - Data Prepper server running at :4900 +2021-09-30T20:19:44,681 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer +2021-09-30T20:19:45,183 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer +2021-09-30T20:19:45,687 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer +2021-09-30T20:19:46,191 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer +2021-09-30T20:19:46,694 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer +2021-09-30T20:19:47,200 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer +2021-09-30T20:19:49,181 [simple-test-pipeline-processor-worker-1-thread-1] INFO com.amazon.dataprepper.pipeline.ProcessWorker - simple-test-pipeline Worker: Processing 6 records from buffer +07dc0d37-da2c-447e-a8df-64792095fb72 +5ac9b10a-1d21-4306-851a-6fb12f797010 +99040c79-e97b-4f1d-a70b-409286f2a671 +5319a842-c028-4c17-a613-3ef101bd2bdd +e51e700e-5cab-4f6d-879a-1c3235a77d18 +b4ed2d7e-cf9c-4e9d-967c-b18e8af35c90 +``` +The remainder of this page provides examples for running Data Prepper from the Docker image. If you +built it from source, refer to the [Developer Guide](https://github.com/opensearch-project/data-prepper/blob/main/docs/developer_guide.md) for more information. + +However you configure your pipeline, you'll run Data Prepper the same way. You run the Docker +image and modify both the `pipelines.yaml` and `data-prepper-config.yaml` files. + +For Data Prepper 2.0 or later, use this command: + +``` +docker run --name data-prepper -p 4900:4900 -v ${PWD}/pipelines.yaml:/usr/share/data-prepper/pipelines/pipelines.yaml -v ${PWD}/data-prepper-config.yaml:/usr/share/data-prepper/config/data-prepper-config.yaml opensearchproject/data-prepper:latest +``` +{% include copy.html %} + +For Data Prepper versions earlier than 2.0, use this command: + +``` +docker run --name data-prepper -p 4900:4900 -v ${PWD}/pipelines.yaml:/usr/share/data-prepper/pipelines.yaml -v ${PWD}/data-prepper-config.yaml:/usr/share/data-prepper/data-prepper-config.yaml opensearchproject/data-prepper:1.x +``` +{% include copy.html %} + +Once Data Prepper is running, it processes data until it is shut down. Once you are done, shut it down with the following command: + +``` +POST /shutdown +``` +{% include copy-curl.html %} + +### Additional configurations + +For Data Prepper 2.0 or later, the Log4j 2 configuration file is read from `config/log4j2.properties` in the application's home directory. By default, it uses `log4j2-rolling.properties` in the *shared-config* directory. + +For Data Prepper 1.5 or earlier, optionally add `"-Dlog4j.configurationFile=config/log4j2.properties"` to the command if you want to pass a custom log4j2 properties file. If no properties file is provided, Data Prepper defaults to the log4j2.properties file in the *shared-config* directory. + +## Next steps + +Trace analytics is an important Data Prepper use case. If you haven't yet configured it, see [Trace analytics]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/trace-analytics/). 
+
+Log ingestion is also an important Data Prepper use case. To learn more, see [Log analytics]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/log-analytics/).
+
+To learn how to run Data Prepper with a Logstash configuration, see [Migrating from Logstash]({{site.url}}{{site.baseurl}}/data-prepper/migrating-from-logstash-data-prepper/).
+
+For information on how to monitor Data Prepper, see [Monitoring]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/monitoring/).
+
+## More examples
+
+For more examples of Data Prepper, see [examples](https://github.com/opensearch-project/data-prepper/tree/main/examples/) in the Data Prepper repo.
diff --git a/_data-prepper/index.md b/_data-prepper/index.md
new file mode 100644
index 00000000..0c7a228b
--- /dev/null
+++ b/_data-prepper/index.md
@@ -0,0 +1,96 @@
+---
+layout: default
+title: Data Prepper
+nav_order: 1
+has_children: false
+has_toc: false
+nav_exclude: true
+permalink: /data-prepper/
+redirect_from:
+  - /clients/data-prepper/index/
+  - /monitoring-plugins/trace/data-prepper/
+  - /data-prepper/index/
+---
+
+# Data Prepper
+
+Data Prepper is a server-side data collector capable of filtering, enriching, transforming, normalizing, and aggregating data for downstream analytics and visualization.
+
+Data Prepper lets users build custom pipelines to improve the operational view of applications. Two common uses for Data Prepper are trace and log analytics. [Trace analytics]({{site.url}}{{site.baseurl}}/observability-plugin/trace/index/) can help you visualize the flow of events and identify performance problems, and [log analytics]({{site.url}}{{site.baseurl}}/observability-plugin/log-analytics/) can improve searching and analysis and provide insights into your application.
+
+## Concepts
+
+Data Prepper includes one or more **pipelines** that collect and filter data based on the components set within the pipeline. Each component is pluggable, enabling you to use your own custom implementation of each component. These components include the following:
+
+- One [source](#source)
+- One or more [sinks](#sink)
+- (Optional) One [buffer](#buffer)
+- (Optional) One or more [processors](#processor)
+
+A single instance of Data Prepper can have one or more pipelines.
+
+Each pipeline definition contains two required components: **source** and **sink**. If buffers and processors are missing from the Data Prepper pipeline, Data Prepper uses the default buffer and a no-op processor.
+
+### Source
+
+Source is the input component that defines the mechanism through which a Data Prepper pipeline will consume events. A pipeline can have only one source. The source can consume events either by receiving the events over HTTP or HTTPS or by reading from external endpoints like the OTel Collector for traces and metrics and Amazon Simple Storage Service (Amazon S3). Sources have their own configuration options based on the format of the events (such as string, JSON, Amazon CloudWatch logs, or OpenTelemetry traces). The source component consumes events and writes them to the buffer component.
+
+### Buffer
+
+The buffer component acts as the layer between the source and the sink. The buffer can be either in-memory or disk based. The default buffer uses an in-memory queue called `bounded_blocking` that is bounded by the number of events. If the buffer component is not explicitly mentioned in the pipeline configuration, Data Prepper uses the default `bounded_blocking`.
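+
+As a minimal sketch, an explicitly configured buffer might look like the following (the values shown are illustrative; see the sample pipeline configurations below for a complete example):
+
+```yml
+sample-pipeline:
+  source:
+    ...
+  buffer:
+    bounded_blocking:
+      buffer_size: 1024 # max number of events the buffer will accept
+      batch_size: 256   # max number of events the buffer will drain for each read
+  sink:
+    ...
+```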
+
+### Sink
+
+Sink is the output component that defines the destination(s) to which a Data Prepper pipeline publishes events. A sink destination could be a service, such as OpenSearch or Amazon S3, or another Data Prepper pipeline. When using another Data Prepper pipeline as the sink, you can chain multiple pipelines together based on the needs of the data. Sink contains its own configuration options based on the destination type.
+
+### Processor
+
+Processors are units within the Data Prepper pipeline that can filter, transform, and enrich events into your desired format before publishing the record to the sink component. If a processor is not defined in the pipeline configuration, the events are published in the format defined in the source component. You can have more than one processor within a pipeline. When using multiple processors, the processors are run in the order they are defined inside the pipeline specification.
+
+## Sample pipeline configurations
+
+To understand how all pipeline components function within a Data Prepper configuration, see the following examples. Each pipeline configuration uses the YAML file format.
+
+### Minimal component
+
+This pipeline configuration reads from the file source and writes to another file in the same path. It uses the default options for the buffer and processor.
+
+```yml
+sample-pipeline:
+  source:
+    file:
+        path: 
+  sink:
+    - file:
+        path: 
+```
+
+### All components
+
+The following pipeline uses a source that reads string events from the `input-file`. The source then pushes the data to the buffer, bounded by a max size of `1024`. The pipeline is configured to have `4` workers, each of them reading a maximum of `256` events from the buffer every `100 milliseconds`. Each worker runs the `string_converter` processor and writes the output of the processor to the `output-file`.
+
+```yml
+sample-pipeline:
+  workers: 4 #Number of workers
+  delay: 100 # in milliseconds, how often the workers should run
+  source:
+    file:
+        path: 
+  buffer:
+    bounded_blocking:
+      buffer_size: 1024 # max number of events the buffer will accept
+      batch_size: 256 # max number of events the buffer will drain for each read
+  processor:
+    - string_converter:
+        upper_case: true
+  sink:
+    - file:
+        path: 
+```
+
+## Next steps
+
+To get started building your own custom pipelines with Data Prepper, see [Getting started]({{site.url}}{{site.baseurl}}/clients/data-prepper/get-started/).
+
+
+
diff --git a/_data-prepper/managing-data-prepper/configuring-data-prepper.md b/_data-prepper/managing-data-prepper/configuring-data-prepper.md
new file mode 100644
index 00000000..bcff65ed
--- /dev/null
+++ b/_data-prepper/managing-data-prepper/configuring-data-prepper.md
@@ -0,0 +1,208 @@
+---
+layout: default
+title: Configuring Data Prepper
+parent: Managing Data Prepper
+nav_order: 5
+redirect_from:
+  - /clients/data-prepper/data-prepper-reference/
+  - /monitoring-plugins/trace/data-prepper-reference/
+---
+
+# Configuring Data Prepper
+
+You can customize your Data Prepper configuration by editing the `data-prepper-config.yaml` file in your Data Prepper installation. The following configuration options are independent of the pipeline configuration options.
+
+
+## Data Prepper configuration
+
+Use the following options to customize your Data Prepper configuration.
+
+Option | Required | Type | Description
+:--- | :--- |:--- | :---
+ssl | No | Boolean | Indicates whether TLS should be used for server APIs. Defaults to true.
+keyStoreFilePath | No | String | The path to a .jks or .p12 keystore file. Required if `ssl` is true. +keyStorePassword | No | String | The password for keystore. Optional, defaults to empty string. +privateKeyPassword | No | String | The password for a private key within keystore. Optional, defaults to empty string. +serverPort | No | Integer | The port number to use for server APIs. Defaults to 4900. +metricRegistries | No | List | The metrics registries for publishing the generated metrics. Currently supports Prometheus and Amazon CloudWatch. Defaults to Prometheus. +metricTags | No | Map | A map of key-value pairs as common metric tags to metric registries. The maximum number of pairs is three. Note that `serviceName` is a reserved tag key with `DataPrepper` as the default tag value. Alternatively, administrators can set this value through the environment variable `DATAPREPPER_SERVICE_NAME`. If `serviceName` is defined in `metricTags`, that value overwrites those set through the above methods. +authentication | No | Object | The authentication configuration. Valid option is `http_basic` with `username` and `password` properties. If not defined, the server does not perform authentication. +processorShutdownTimeout | No | Duration | The time given to processors to clear any in-flight data and gracefully shut down. Default is 30s. +sinkShutdownTimeout | No | Duration | The time given to sinks to clear any in-flight data and gracefully shut down. Default is 30s. +peer_forwarder | No | Object | Peer forwarder configurations. See [Peer forwarder options](#peer-forwarder-options) for more details. +circuit_breakers | No | [circuit_breakers](#circuit-breakers) | Configures a circuit breaker on incoming data. +extensions | No | Object | The pipeline extension plugin configurations. See [Extension plugins](#extension-plugins) for more details. + +### Peer forwarder options + +The following section details various configuration options for peer forwarder. + +#### General options for peer forwarding + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +port | No | Integer | The peer forwarding server port. Valid options are between 0 and 65535. Defaults is 4994. +request_timeout | No | Integer | The request timeout for the peer forwarder HTTP server in milliseconds. Default is 10000. +server_thread_count | No | Integer | The number of threads used by the peer forwarder server. Default is 200. +client_thread_count | No | Integer | The number of threads used by the peer forwarder client. Default is 200. +max_connection_count | No | Integer | The maximum number of open connections for the peer forwarder server. Default is 500. +max_pending_requests | No | Integer | The maximum number of allowed tasks in ScheduledThreadPool work queue. Default is 1024. +discovery_mode | No | String | The peer discovery mode to use. Valid options are `local_node`, `static`, `dns`, or `aws_cloud_map`. Defaults to `local_node`, which processes events locally. +static_endpoints | Conditionally | List | A list containing endpoints of all Data Prepper instances. Required if `discovery_mode` is set to static. +domain_name | Conditionally | String | A single domain name to query DNS against. Typically, used by creating multiple DNS A Records for the same domain. Required if `discovery_mode` is set to dns. +aws_cloud_map_namespace_name | Conditionally | String | Cloud Map namespace when using AWS Cloud Map service discovery. Required if `discovery_mode` is set to `aws_cloud_map`. 
+aws_cloud_map_service_name | Conditionally | String | The Cloud Map service name when using AWS Cloud Map service discovery. Required if `discovery_mode` is set to `aws_cloud_map`.
+aws_cloud_map_query_parameters | No | Map | A map of key-value pairs to filter the results based on the custom attributes attached to an instance. Only instances that match all the specified key-value pairs are returned.
+buffer_size | No | Integer | The maximum number of unchecked records the buffer accepts. The number of unchecked records is the sum of the number of records written into the buffer and the number of in-flight records not yet checked by the Checkpointing API. Default is 512.
+batch_size | No | Integer | The maximum number of records the buffer returns on read. Default is 48.
+aws_region | Conditionally | String | The AWS Region to use with ACM, Amazon S3, or AWS Cloud Map. Required if `use_acm_certificate_for_ssl` is set to true, if `ssl_certificate_file` and `ssl_key_file` is an AWS S3 path, or if `discovery_mode` is set to `aws_cloud_map`.
+drain_timeout | No | Duration | The wait time for the peer forwarder to complete processing data before shutdown. Default is `10s`.
+
+#### TLS/SSL options for peer forwarder
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+ssl | No | Boolean | Enables TLS/SSL. Default is `true`.
+ssl_certificate_file | Conditionally | String | The SSL certificate chain file path or AWS S3 path. S3 path example `s3:///`. Required if `ssl` is true and `use_acm_certificate_for_ssl` is false. Defaults to `config/default_certificate.pem`, which is the default certificate file. Read more about how the certificate file is generated [here](https://github.com/opensearch-project/data-prepper/tree/main/examples/certificates).
+ssl_key_file | Conditionally | String | The SSL key file path or AWS S3 path. S3 path example `s3:///`. Required if `ssl` is true and `use_acm_certificate_for_ssl` is false. Defaults to `config/default_private_key.pem`, which is the default private key file. Read more about how the default private key file is generated [here](https://github.com/opensearch-project/data-prepper/tree/main/examples/certificates).
+ssl_insecure_disable_verification | No | Boolean | Disables the verification of the server's TLS certificate chain. Default is false.
+ssl_fingerprint_verification_only | No | Boolean | Disables the verification of the server's TLS certificate chain and instead verifies only the certificate fingerprint. Default is false.
+use_acm_certificate_for_ssl | No | Boolean | Enables TLS/SSL using a certificate and private key from AWS Certificate Manager (ACM). Default is false.
+acm_certificate_arn | Conditionally | String | The ACM certificate ARN. The ACM certificate takes precedence over an S3 or local file system certificate. Required if `use_acm_certificate_for_ssl` is set to true.
+acm_private_key_password | No | String | The ACM private key password that decrypts the private key. If not provided, Data Prepper generates a random password.
+acm_certificate_timeout_millis | No | Integer | The timeout in milliseconds for ACM to get certificates. Default is 120000.
+aws_region | Conditionally | String | The AWS Region to use with ACM, Amazon S3, or AWS Cloud Map. Required if `use_acm_certificate_for_ssl` is set to true, if `ssl_certificate_file` and `ssl_key_file` is an AWS S3 path, or if `discovery_mode` is set to `aws_cloud_map`.
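+
+As a reference, the following `data-prepper-config.yaml` sketch combines several of the general and TLS/SSL peer forwarder options described above. It assumes a two-node static cluster and uses the documented default certificate paths; adjust the endpoints and file paths for your environment:
+
+```yaml
+peer_forwarder:
+  port: 4994
+  discovery_mode: static
+  static_endpoints: ["data-prepper1", "data-prepper2"]
+  ssl: true
+  ssl_certificate_file: "config/default_certificate.pem"
+  ssl_key_file: "config/default_private_key.pem"
+```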
+
+#### Authentication options for peer forwarder
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+authentication | No | Map | The authentication method to use. Valid options are `mutual_tls` (use mTLS) or `unauthenticated` (no authentication). Default is `unauthenticated`.
+
+### Circuit breakers
+
+Data Prepper provides a circuit breaker to help prevent exhausting Java memory. This is useful when pipelines have stateful processors because these processors can retain memory usage outside of the buffers.
+
+When a circuit breaker is tripped, Data Prepper rejects incoming data from being routed into buffers.
+
+
+Option | Required | Type | Description
+:--- | :--- |:---| :---
+heap | No | [heap](#heap-circuit-breaker) | Enables a heap circuit breaker. By default, this is not enabled.
+
+
+#### Heap circuit breaker
+
+Configures Data Prepper to trip a circuit breaker when JVM heap usage reaches a specified threshold.
+
+Option | Required | Type | Description
+:--- |:---|:---| :---
+usage | Yes | Bytes | Specifies the JVM heap usage at which to trip a circuit breaker. If the current Java heap usage exceeds this value, then the circuit breaker opens. This can be a value such as `6.5gb`.
+reset | No | Duration | After tripping the circuit breaker, no new checks are made until after this time has passed. This effectively sets the minimum amount of time for a breaker to remain open to allow for clearing memory. Defaults to `1s`.
+check_interval | No | Duration | Specifies the time between checks of the heap size. Defaults to `500ms`.
+
+### Extension plugins
+
+Data Prepper 2.5 and later support user-configurable extension plugins. Extension plugins are common configurations shared across pipeline plugins, such as [sources, buffers, processors, and sinks]({{site.url}}{{site.baseurl}}/data-prepper/index/#concepts).
+
+### AWS extension plugins
+
+To use the AWS extension plugin, add the following setting to your `data-prepper-config.yaml` under `aws`.
+
+Option | Required | Type | Description
+:--- |:---|:---| :---
+aws | No | Object | The AWS extension plugins configuration.
+
+#### AWS secrets extension plugin
+
+The AWS secrets extension plugin configures [AWS Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/intro.html) secrets to be referenced in pipeline plugin configurations, as shown in the following example:
+
+```yaml
+extensions:
+  aws:
+    secrets:
+      :
+        secret_id: 
+        region: 
+        sts_role_arn: 
+        refresh_interval: 
+      :
+        ...
+```
+
+To use the secrets extension plugin, add the following setting to your `pipeline.yaml` under `extensions` > `aws`.
+
+Option | Required | Type | Description
+:--- |:---|:---| :---
+secrets | No | Object | The AWS Secrets Manager extension plugin configuration. See [Secrets](#secrets) for more details.
+
+### Secrets
+
+Use the following settings under the `secrets` extension setting.
+
+
+Option | Required | Type | Description
+:--- |:---|:---| :---
+secret_id | Yes | String | The AWS secret name or ARN. |
+region | No | String | The AWS Region of the secret. Defaults to `us-east-1`.
+sts_role_arn | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to AWS Secrets Manager. Defaults to `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html).
+refresh_interval | No | Duration | The interval at which the AWS secrets extension plugin polls for new secret values. Defaults to `PT1H`. See [Automatically refreshing secrets](#automatically-refreshing-secrets) for details.
+
+#### Reference secrets
+
+In `pipelines.yaml`, secret values can be referenced within the pipeline plugins using the following formats:
+
+* plaintext: `{% raw %}${{aws_secrets:}}{% endraw %}`.
+* JSON (key-value pairs): `{% raw %}${{aws_secrets::}}{% endraw %}`
+
+
+Replace `` with the corresponding secret config ID under `/extensions/aws/secrets`. Replace `` with the desired key in the secret JSON value. The secret value reference string format can be interpreted for the following plugin setting data types:
+
+* String
+* Number
+* Long
+* Short
+* Integer
+* Double
+* Float
+* Boolean
+* Character
+
+The following example section of `data-prepper-config.yaml` names two secret config IDs, `host-secret-config` and `credential-secret-config`:
+
+
+```yaml
+extensions:
+  aws:
+    secrets:
+      host-secret-config:
+        secret_id: 
+        region: 
+        sts_role_arn: 
+        refresh_interval: 
+      credential-secret-config:
+        secret_id: 
+        region: 
+        sts_role_arn: 
+        refresh_interval: 
+```
+
+After `` is configured, you can reference the IDs in your `pipelines.yaml`:
+
+```
+sink:
+  - opensearch:
+      hosts: [ {% raw %}"${{aws_secrets:host-secret-config}}"{% endraw %} ]
+      username: {% raw %}"${{aws_secrets:credential-secret-config:username}}"{% endraw %}
+      password: {% raw %}"${{aws_secrets:credential-secret-config:password}}"{% endraw %}
+      index: "test-migration"
+```
+
+
+#### Automatically refreshing secrets
+
+For each individual secret configuration, the latest secret value is polled at a regular interval to support refreshing secrets in AWS Secrets Manager. The refreshed secret values are used by certain pipeline plugins to refresh their components, such as the connection and authentication to the backend service.
+
+For multiple secret configurations, a jitter of up to `60s` is applied across all configurations during the initial secrets polling.
diff --git a/_data-prepper/managing-data-prepper/configuring-log4j.md b/_data-prepper/managing-data-prepper/configuring-log4j.md
new file mode 100644
index 00000000..175c754a
--- /dev/null
+++ b/_data-prepper/managing-data-prepper/configuring-log4j.md
@@ -0,0 +1,29 @@
+---
+layout: default
+title: Configuring Log4j
+parent: Managing Data Prepper
+nav_order: 20
+---
+
+# Configuring Log4j
+
+You can configure logging using Log4j in Data Prepper.
+
+## Logging
+
+Data Prepper uses [SLF4J](https://www.slf4j.org/) with a [Log4j 2 binding](https://logging.apache.org/log4j/2.x/log4j-slf4j-impl.html).
+
+For Data Prepper versions 2.0 and later, the Log4j 2 configuration file can be found and edited in `config/log4j2.properties` in the application's home directory. The default properties for Log4j 2 can be found in `log4j2-rolling.properties` in the *shared-config* directory.
+
+For Data Prepper versions before 2.0, the Log4j 2 configuration file can be overridden by setting the `log4j.configurationFile` system property when running Data Prepper. The default properties for Log4j 2 can be found in `log4j2.properties` in the *shared-config* directory.
+ +### Example + +When running Data Prepper, the following command can be overridden by setting the system property `-Dlog4j.configurationFile={property_value}`, where `{property_value}` is a path to the Log4j 2 configuration file: + +``` +java "-Dlog4j.configurationFile=config/custom-log4j2.properties" -jar data-prepper-core-$VERSION.jar pipelines.yaml data-prepper-config.yaml +``` + +See the [Log4j 2 configuration documentation](https://logging.apache.org/log4j/2.x/manual/configuration.html) for more information about Log4j 2 configuration. + diff --git a/_data-prepper/managing-data-prepper/core-apis.md b/_data-prepper/managing-data-prepper/core-apis.md new file mode 100644 index 00000000..b810c7b1 --- /dev/null +++ b/_data-prepper/managing-data-prepper/core-apis.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Core APIs +parent: Managing Data Prepper +nav_order: 15 +--- + +# Core APIs + +All Data Prepper instances expose a server with some control APIs. By default, this server runs on port 4900. Some plugins, especially source plugins, may expose other servers that run on different ports. Configurations for these plugins are independent of the core API. For example, to shut down Data Prepper, you can run the following curl request: + +``` +curl -X POST http://localhost:4900/shutdown +``` + +## APIs + +The following table lists the available APIs. + +| Name | Description | +| --- | --- | +| ```GET /list```
```POST /list``` | Returns a list of running pipelines. | +| ```POST /shutdown``` | Starts a graceful shutdown of Data Prepper. | +| ```GET /metrics/prometheus```
```POST /metrics/prometheus``` | Returns a scrape of Data Prepper metrics in Prometheus text format. This API is available as a `metricsRegistries` parameter in the Data Prepper configuration file `data-prepper-config.yaml` and contains `Prometheus` as part of the registry. +| ```GET /metrics/sys```
```POST /metrics/sys``` | Returns JVM metrics in Prometheus text format. This API is available as a `metricsRegistries` parameter in the Data Prepper configuration file `data-prepper-config.yaml` and contains `Prometheus` as part of the registry. + +## Configuring the server + +You can configure your Data Prepper core APIs through the `data-prepper-config.yaml` file. + +### SSL/TLS connection + +Many of the getting started guides for this project disable SSL on the endpoint: + +```yaml +ssl: false +``` + +To enable SSL on your Data Prepper endpoint, configure your `data-prepper-config.yaml` file with the following options: + +```yaml +ssl: true +keyStoreFilePath: "/usr/share/data-prepper/keystore.p12" +keyStorePassword: "secret" +privateKeyPassword: "secret" +``` + +For more information about configuring your Data Prepper server with SSL, see [Server Configuration](https://github.com/opensearch-project/data-prepper/blob/main/docs/configuration.md#server-configuration). If you are using a self-signed certificate, you can add the `-k` flag to the request to quickly test core APIs with SSL. Use the following `shutdown` request to test core APIs with SSL: + + +``` +curl -k -X POST https://localhost:4900/shutdown +``` + +### Authentication + +The Data Prepper core APIs support HTTP basic authentication. You can set the username and password with the following configuration in the `data-prepper-config.yaml` file: + +```yaml +authentication: + http_basic: + username: "myuser" + password: "mys3cr3t" +``` + +You can disable authentication of core endpoints using the following configuration. Use this with caution because the shutdown API and others will be accessible to anybody with network access to your Data Prepper instance. + +```yaml +authentication: + unauthenticated: +``` + +### Peer Forwarder + +Peer Forwarder can be configured to enable stateful aggregation across multiple Data Prepper nodes. For more information about configuring Peer Forwarder, see [Peer forwarder]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/peer-forwarder/). It is supported by the `service_map_stateful`, `otel_traces_raw`, and `aggregate` processors. + +### Shutdown timeouts + +When you run the Data Prepper `shutdown` API, the process gracefully shuts down and clears any remaining data for both the `ExecutorService` sink and `ExecutorService` processor. The default timeout for shutdown of both processes is 10 seconds. You can configure the timeout with the following optional `data-prepper-config.yaml` file parameters: + +```yaml +processorShutdownTimeout: "PT15M" +sinkShutdownTimeout: 30s +``` + +The values for these parameters are parsed into a `Duration` object through the [Data Prepper Duration Deserializer](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-pipeline-parser/src/main/java/org/opensearch/dataprepper/pipeline/parser/DataPrepperDurationDeserializer.java). diff --git a/_data-prepper/managing-data-prepper/managing-data-prepper.md b/_data-prepper/managing-data-prepper/managing-data-prepper.md new file mode 100644 index 00000000..ea2d1f11 --- /dev/null +++ b/_data-prepper/managing-data-prepper/managing-data-prepper.md @@ -0,0 +1,10 @@ +--- +layout: default +title: Managing Data Prepper +has_children: true +nav_order: 20 +--- + +# Managing Data Prepper + +You can perform administrator functions for Data Prepper, including system configuration, interacting with core APIs, Log4j configuration, and monitoring. 
You can set up peer forwarding to coordinate multiple Data Prepper nodes when using stateful aggregation.
\ No newline at end of file
diff --git a/_data-prepper/managing-data-prepper/monitoring.md b/_data-prepper/managing-data-prepper/monitoring.md
new file mode 100644
index 00000000..691f376b
--- /dev/null
+++ b/_data-prepper/managing-data-prepper/monitoring.md
@@ -0,0 +1,59 @@
+---
+layout: default
+title: Monitoring
+parent: Managing Data Prepper
+nav_order: 25
+---
+
+# Monitoring Data Prepper with metrics
+
+You can monitor Data Prepper with metrics using [Micrometer](https://micrometer.io/). There are two types of metrics: JVM/system metrics and plugin metrics. [Prometheus](https://prometheus.io/) is used as the default metrics backend.
+
+## JVM and system metrics
+
+JVM and system metrics are runtime metrics that are used to monitor Data Prepper instances. They include metrics for classloaders, memory, garbage collection, threads, and others. For more information, see [JVM and system metrics](https://micrometer.io/?/docs/ref/jvm).
+
+### Naming
+
+JVM and system metrics follow predefined names in [Micrometer](https://micrometer.io/?/docs/concepts#_naming_meters). For example, the Micrometer metrics name for memory usage is `jvm.memory.used`. Micrometer changes the name to match the metrics system. Following the same example, `jvm.memory.used` is reported to Prometheus as `jvm_memory_used`, and is reported to Amazon CloudWatch as `jvm.memory.used.value`.
+
+### Serving
+
+By default, metrics are served from the **/metrics/sys** endpoint on the Data Prepper server in Prometheus scrape format. You can configure Prometheus to scrape from the Data Prepper URL. Prometheus then polls Data Prepper for metrics and stores them in its database. To visualize the data, you can set up any frontend that accepts Prometheus metrics, such as [Grafana](https://prometheus.io/docs/visualization/grafana/). You can update the configuration to serve metrics to other registries, such as Amazon CloudWatch, which does not require or host the endpoint but instead publishes the metrics directly to CloudWatch.
+
+## Plugin metrics
+
+Plugins report their own metrics. Data Prepper uses a naming convention to help with consistency in the metrics. Plugin metrics do not use dimensions.
+
+
+1. AbstractBuffer
+    - Counter
+      - `recordsWritten`: The number of records written into a buffer
+      - `recordsRead`: The number of records read from a buffer
+      - `recordsProcessed`: The number of records read from a buffer and marked as processed
+      - `writeTimeouts`: The count of write timeouts in a buffer
+    - Gauge
+      - `recordsInBuffer`: The number of records in a buffer
+      - `recordsInFlight`: The number of records read from a buffer and being processed by downstream Data Prepper components (for example, processors and sinks)
+    - Timer
+      - `readTimeElapsed`: The time elapsed while reading from a buffer
+      - `checkpointTimeElapsed`: The time elapsed while checkpointing
+2. AbstractProcessor
+    - Counter
+      - `recordsIn`: The number of records ingressed into a processor
+      - `recordsOut`: The number of records egressed from a processor
+    - Timer
+      - `timeElapsed`: The time elapsed during initiation of a processor
+3. AbstractSink
+    - Counter
+      - `recordsIn`: The number of records ingressed into a sink
+    - Timer
+      - `timeElapsed`: The time elapsed during execution of a sink
+
+### Naming
+
+Metrics follow a naming convention of **PIPELINE_NAME_PLUGIN_NAME_METRIC_NAME**.
For example, a **recordsIn** metric for the **opensearch-sink** plugin in a pipeline named **output-pipeline** has a qualified name of **output-pipeline_opensearch_sink_recordsIn**.
+
+### Serving
+
+By default, metrics are served from the **/metrics/sys** endpoint on the Data Prepper server in a Prometheus scrape format. You can configure Prometheus to scrape from the Data Prepper URL. The Data Prepper server port has a default value of `4900` that you can modify, and this port can be used for any frontend that accepts Prometheus metrics, such as [Grafana](https://prometheus.io/docs/visualization/grafana/). You can update the configuration to serve metrics to other registries, such as CloudWatch, which does not require or host the endpoint but instead publishes the metrics directly to CloudWatch.
\ No newline at end of file
diff --git a/_data-prepper/managing-data-prepper/peer-forwarder.md b/_data-prepper/managing-data-prepper/peer-forwarder.md
new file mode 100644
index 00000000..f6a0f989
--- /dev/null
+++ b/_data-prepper/managing-data-prepper/peer-forwarder.md
@@ -0,0 +1,182 @@
+---
+layout: default
+title: Peer forwarder
+nav_order: 12
+parent: Managing Data Prepper
+---
+
+# Peer forwarder
+
+Peer forwarder is an HTTP service that performs peer forwarding of an `event` between Data Prepper nodes for aggregation. This HTTP service uses a hash-ring approach to aggregate events and determine which Data Prepper node should handle a given trace before rerouting the events to that node. Currently, peer forwarder is supported by the `aggregate`, `service_map_stateful`, and `otel_traces_raw` [processors]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/processors/).
+
+Peer forwarder groups events based on the identification keys provided by the supported processors. For `service_map_stateful` and `otel_traces_raw`, the identification key is `traceId` by default and cannot be configured. The `aggregate` processor is configured using the `identification_keys` configuration option. From here, you can specify which keys to use for peer forwarder. See the [Aggregate processor page](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#identification_keys) for more information about identification keys.
+
+Peer discovery allows Data Prepper to find other nodes that it will communicate with. Currently, peer discovery is provided by a static list, a DNS record lookup, or AWS Cloud Map.
+
+## Discovery modes
+
+The following sections provide information about discovery modes.
+
+### Static
+
+Static discovery mode allows a Data Prepper node to discover nodes using a list of IP addresses or domain names. See the following YAML file for an example of static discovery mode:
+
+```yaml
+peer_forwarder:
+  discovery_mode: static
+  static_endpoints: ["data-prepper1", "data-prepper2"]
+```
+
+### DNS lookup
+
+DNS discovery is preferred over static discovery when scaling out a Data Prepper cluster. DNS discovery configures a DNS provider to return a list of Data Prepper hosts when given a single domain name. This list comes from a [DNS A record](https://www.cloudflare.com/learning/dns/dns-records/dns-a-record/), which contains the IP addresses of a given domain.
See the following YAML file for an example of DNS lookup: + +```yaml +peer_forwarder: + discovery_mode: dns + domain_name: "data-prepper-cluster.my-domain.net" +``` + +### AWS Cloud Map + +[AWS Cloud Map](https://docs.aws.amazon.com/cloud-map/latest/dg/what-is-cloud-map.html) provides API-based service discovery as well as DNS-based service discovery. + +Peer forwarder can use the API-based service discovery in AWS Cloud Map. To support this, you must have an existing namespace configured for API instance discovery. You can create a new one by following the instructions provided by the [AWS Cloud Map documentation](https://docs.aws.amazon.com/cloud-map/latest/dg/working-with-namespaces.html). + +Your Data Prepper configuration needs to include the following: +* `aws_cloud_map_namespace_name` – Set to your AWS Cloud Map namespace name. +* `aws_cloud_map_service_name` – Set to the service name within your specified namespace. +* `aws_region` – Set to the AWS Region in which your namespace exists. +* `discovery_mode` – Set to `aws_cloud_map`. + +Your Data Prepper configuration can optionally include the following: +* `aws_cloud_map_query_parameters` – Key-value pairs are used to filter the results based on the custom attributes attached to an instance. Results include only those instances that match all of the specified key-value pairs. + +#### Example configuration + +See the following YAML file example of AWS Cloud Map configuration: + +```yaml +peer_forwarder: + discovery_mode: aws_cloud_map + aws_cloud_map_namespace_name: "my-namespace" + aws_cloud_map_service_name: "data-prepper-cluster" + aws_cloud_map_query_parameters: + instance_type: "r5.xlarge" + aws_region: "us-east-1" +``` + +### IAM policy with necessary permissions + +Data Prepper must also be running with the necessary permissions. The following AWS Identity and Access Management (IAM) policy shows the necessary permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CloudMapPeerForwarder", + "Effect": "Allow", + "Action": "servicediscovery:DiscoverInstances", + "Resource": "*" + } + ] +} +``` + + +## Configuration + +The following table provides optional configuration values. + + +| Value | Type | Description | +| ---- | --- | ----------- | +| `port` | Integer | A value between 0 and 65535 that represents the port that the peer forwarder server is running on. Default value is `4994`. | +| `request_timeout` | Integer | Represents the request timeout duration in milliseconds for the peer forwarder HTTP server. Default value is `10000`. | +| `server_thread_count` | Integer | Represents the number of threads used by the peer forwarder server. Default value is `200`.| +| `client_thread_count` | Integer | Represents the number of threads used by the peer forwarder client. Default value is `200`.| +| `maxConnectionCount` | Integer | Represents the maximum number of open connections for the peer forwarder server. Default value is `500`. | +| `discovery_mode` | String | Represents the peer discovery mode to be used. Allowable values are `local_node`, `static`, `dns`, and `aws_cloud_map`. Defaults to `local_node`, which processes events locally. | +| `static_endpoints` | List | Contains the endpoints of all Data Prepper instances. Required if `discovery_mode` is set to `static`. | +| `domain_name` | String | Represents the single domain name to query DNS against. Typically used by creating multiple [DNS A records](https://www.cloudflare.com/learning/dns/dns-records/dns-a-record/) for the same domain. 
Required if `discovery_mode` is set to `dns`. | +| `aws_cloud_map_namespace_name` | String | Represents the AWS Cloud Map namespace when using AWS Cloud Map service discovery. Required if `discovery_mode` is set to `aws_cloud_map`. | +| `aws_cloud_map_service_name` | String | Represents the AWS Cloud Map service when using AWS Cloud Map service discovery. Required if `discovery_mode` is set to `aws_cloud_map`. | +| `aws_cloud_map_query_parameters` | Map | Key-value pairs used to filter the results based on the custom attributes attached to an instance. Only instances that match all the specified key-value pairs are returned. | +| `buffer_size` | Integer | Represents the maximum number of unchecked records the buffer accepts (the number of unchecked records equals the number of records written into the buffer plus the number of records that are still processing and not yet checked by the Checkpointing API). Default is `512`. | +| `batch_size` | Integer | Represents the maximum number of records that the buffer returns on read. Default is `48`. | +| `aws_region` | String | Represents the AWS Region that uses `ACM`, `Amazon S3`, or `AWS Cloud Map` and is required when any of the following conditions are met:
- The `use_acm_certificate_for_ssl` setting is set to `true`.
- Either `ssl_certificate_file` or `ssl_key_file` specifies an Amazon Simple Storage Service (Amazon S3) URI (for example, s3://mybucket/path/to/public.cert).
- The `discovery_mode` is set to `aws_cloud_map`. | +| `drain_timeout` | Duration | Represents the amount of time that peer forwarder will wait to complete data processing before shutdown. | + +## SSL configuration + +The following table provides optional SSL configuration values that allow you to set up a trust manager for the peer forwarder client in order to connect to other Data Prepper instances. + +| Value | Type | Description | +| ----- | ---- | ----------- | +| `ssl` | Boolean | Enables TLS/SSL. Default value is `true`. | +| `ssl_certificate_file`| String | Represents the SSL certificate chain file path or Amazon S3 path. The following is an example of an Amazon S3 path: `s3:///`. Defaults to the default certificate file,`config/default_certificate.pem`. See [Default Certificates](https://github.com/opensearch-project/data-prepper/tree/main/examples/certificates) for more information about how the certificate is generated. | +| `ssl_key_file`| String | Represents the SSL key file path or Amazon S3 path. Amazon S3 path example: `s3:///`. Defaults to `config/default_private_key.pem` which is the default private key file. See [Default Certificates](https://github.com/opensearch-project/data-prepper/tree/main/examples/certificates) for more information about how the private key file is generated. | +| `ssl_insecure_disable_verification` | Boolean | Disables the verification of the server's TLS certificate chain. Default value is `false`. | +| `ssl_fingerprint_verification_only` | Boolean | Disables the verification of the server's TLS certificate chain and instead verifies only the certificate fingerprint. Default value is `false`. | +| `use_acm_certificate_for_ssl` | Boolean | Enables TLS/SSL using the certificate and private key from AWS Certificate Manager (ACM). Default value is `false`. | +| `acm_certificate_arn`| String | Represents the ACM certificate Amazon Resource Name (ARN). The ACM certificate takes precedence over Amazon S3 or the local file system certificate. Required if `use_acm_certificate_for_ssl` is set to `true`. | +| `acm_private_key_password` | String | Represents the ACM private key password that will be used to decrypt the private key. If it's not provided, a random password will be generated. | +| `acm_certificate_timeout_millis` | Integer | Represents the timeout in milliseconds required for ACM to get certificates. Default value is `120000`. | +| `aws_region` | String | Represents the AWS Region that uses ACM, Amazon S3, or AWS Cloud Map. Required if `use_acm_certificate_for_ssl` is set to `true` or `ssl_certificate_file`. Also required when the `ssl_key_file` is set to use the Amazon S3 path or if `discovery_mode` is set to `aws_cloud_map`. | + +#### Example configuration + +The following YAML file provides an example configuration: + +```yaml +peer_forwarder: + ssl: true + ssl_certificate_file: "" + ssl_key_file: "" +``` + +## Authentication + +`Authentication` is optional and is a `Map` that enables mutual TLS (mTLS). It can either be `mutual_tls` or `unauthenticated`. The default value is `unauthenticated`. The following YAML file provides an example of authentication: + +```yaml +peer_forwarder: + authentication: + mutual_tls: +``` + +## Metrics + +Core peer forwarder introduces the following custom metrics. All the metrics are prefixed by `core.peerForwarder`. + +### Timer + +Peer forwarder's timer capability provides the following information: + +- `requestForwardingLatency`: Measures latency of requests forwarded by the peer forwarder client. 
+- `requestProcessingLatency`: Measures latency of requests processed by the peer forwarder server. + +### Counter + +The following table provides counter metric options. + +| Value | Description | +| ----- | ----------- | +| `requests`| Measures the total number of forwarded requests. | +| `requestsFailed`| Measures the total number of failed requests. Applies to requests with an HTTP response code other than `200`. | +| `requestsSuccessful`| Measures the total number of successful requests. Applies to requests with HTTP response code `200`. | +| `requestsTooLarge`| Measures the total number of requests that are too large to be written to the peer forwarder buffer. Applies to requests with HTTP response code `413`. | +| `requestTimeouts`| Measures the total number of requests that time out while writing content to the peer forwarder buffer. Applies to requests with HTTP response code `408`. | +| `requestsUnprocessable`| Measures the total number of requests that fail due to an unprocessable entity. Applies to requests with HTTP response code `422`. | +| `badRequests`| Measures the total number of requests with a bad request format. Applies to requests with HTTP response code `400`. | +| `recordsSuccessfullyForwarded`| Measures the total number of successfully forwarded records. | +| `recordsFailedForwarding`| Measures the total number of records that fail to be forwarded. | +| `recordsToBeForwarded` | Measures the total number of records to be forwarded. | +| `recordsToBeProcessedLocally` | Measures the total number of records to be processed locally. | +| `recordsActuallyProcessedLocally`| Measures the total number of records actually processed locally. This value is the sum of `recordsToBeProcessedLocally` and `recordsFailedForwarding`. | +| `recordsReceivedFromPeers`| Measures the total number of records received from remote peers. | + +### Gauge + +`peerEndpoints` Measures the number of dynamically discovered peer Data Prepper endpoints. For `static` mode, the size is fixed. diff --git a/_data-prepper/managing-data-prepper/source-coordination.md b/_data-prepper/managing-data-prepper/source-coordination.md new file mode 100644 index 00000000..3c60b452 --- /dev/null +++ b/_data-prepper/managing-data-prepper/source-coordination.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Source coordination +nav_order: 35 +parent: Managing Data Prepper +--- + +# Source coordination + +_Source coordination_ is the concept of coordinating and distributing work between Data Prepper data sources in a multi-node environment. Some data sources, such as Amazon Kinesis or Amazon Simple Queue Service (Amazon SQS), handle coordination natively. Other data sources, such as OpenSearch, Amazon Simple Storage Service (Amazon S3), Amazon DynamoDB, and JDBC/ODBC, do not support source coordination. + +Data Prepper source coordination decides which partition of work is performed by each node in the Data Prepper cluster and prevents duplicate partitions of work. + +Inspired by the [Kinesis Client Library](https://docs.aws.amazon.com/streams/latest/dev/shared-throughput-kcl-consumers.html), Data Prepper utilizes a distributed store in the form of a lease to handle the distribution and deduplication of work. + +## Formatting partitions + +Source coordination separates sources into "partitions of work." For example, an S3 object would be a partition of work for Amazon S3, or an OpenSearch index would be a partition of work for OpenSearch. 
+ +Data Prepper takes each partition of work that is chosen by the source and creates corresponding items in the distributed store that Data Prepper uses for source coordination. Each of these items has the following standard format, which can be extended by the distributed store implementation. + +| Value | Type | Description | +| :--- | :--- | :--- | +| `sourceIdentifier` | String | The identifier for which the Data Prepper pipeline works on this partition. By default, the `sourceIdentifier` is prefixed by the sub-pipeline name, but an additional prefix can be configured with `partition_prefix` in your data-prepper-config.yaml file. | +| `sourcePartitionKey` | String | The identifier for the partition of work associated with this item. For example, for an `s3` source with scan capabilities, this identifier is the S3 bucket's `objectKey` combination. +| `partitionOwner` | String | An identifier for the node that actively owns and is working on this partition. This ID contains the hostname of the node but is `null` when this partition is not owned. | +| `partitionProgressState` | String | A JSON string object representing the progress made on a partition of work or any additional metadata that may be needed by the source in the case of another node resuming where the last node stopped during a crash. | +| `partitionOwnershipTimeout` | Timestamp | Whenever a Data Prepper node acquires a partition, a 10-minute timeout is given to the owner of the partition to handle the event of a node crashing. The ownership is renewed with another 10 minutes when the owner saves the state of the partition. | +| `sourcePartitionStatus` | Enum | Represents the current state of the partition: `ASSIGNED` means the partition is currently being processed, `UNASSIGNED` means the partition is waiting to be processed, `CLOSED` means the partition is waiting to be processed at a later date, and `COMPLETED` means the partition has already been processed. | +| `reOpenAt` | Timestamp | Represents the time at which CLOSED partitions reopen and are considered to be available for processing. Only applies to CLOSED partitions. | +| `closedCount` | Long | Tracks how many times the partition has been marked as `CLOSED`.| + + +## Acquiring partitions + +Partitions are acquired in the order that they are returned in the `List` provided by the source. When a node attempts to acquire a partition, Data Prepper performs the following steps: + +1. Data Prepper queries the `ASSIGNED` partitions to check whether any `ASSIGNED` partitions have expired partition owners. This is intended to assign priority to partitions that have had nodes crash in the middle of processing, which can allow for using a partition state that may be time sensitive. +2. After querying `ASSIGNED` partitions, Data Prepper queries the `CLOSED` partitions to determine whether any of the partition's `reOpenAt` timestamps have been reached. +3. If there are no `ASSIGNED` or `CLOSED` partitions available, then Data Prepper queries the `UNASSIGNED` partitions until on of these partitions is `ASSIGNED`. + +If this flow occurs and no partition is acquired by the node, then the partition supplier function provided in the `getNextPartition` method of `SourceCoordinator` will create new partitions. After the supplier function completes, Data Prepper again queries the partitions for `ASSIGNED`, `CLOSED`, and `UNASSIGNED`. + +## Global state + +Any function that is passed to the `getNextPartition` method creates new partitions with a global state of `Map`. 
This state is shared between all of the nodes in the cluster and will only be run by a single node at a time, as determined by the source. + +## Configuration + +The following table provide optional configuration values for `source_coordination`. + +| Value | Type | Description | +| :--- | :--- | :--- | +| `partition_prefix` | String | A prefix to the `sourceIdentifier` used to differentiate between Data Prepper clusters that share the same distributed store. | +| `store` | Object | The object that comprises the configuration for the store to be used, where the key is the name of the store, such as `in_memory` or `dynamodb`, and the value is any configuration available on that store type. | + +### Supported stores +As of Data Prepper 2.4, only `in_memory` and `dynamodb` stores are supported: + +- The `in_memory` store is the +default when no `source_coordination` settings are configured in the `data-prepper-config.yaml` file and should only be used for single-node configurations. +- The `dynamodb` store is used for multi-node Data Prepper environments. The `dynamodb` store can be shared between one or more Data Prepper clusters that need to utilize source coordination. + +#### DynamoDB store + +Data Prepper will attempt to create the `dynamodb` table on startup unless the `skip_table_creation` flag is configured to `true`. Optionally, you can configure the [time-to-live](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/TTL.html) (`ttl`) on the table, which results in the store cleaning up items over time. Some sources rely on source coordination for the deduplication of data, so be sure to configure a large enough `ttl` for the pipeline duration. + +If `ttl` is not configured on the table, any items no longer needed in the table must be cleaned manually. + +The following shows the full set of permissions needed for Data Prepper to create the table, enable `ttl`, and interact with the table: + +```json +{ + "Sid": "ReadWriteSourceCoordinationDynamoStore", + "Effect": "Allow", + "Action": [ + "dynamodb:DescribeTimeToLive", + "dynamodb:UpdateTimeToLive", + "dynamodb:DescribeTable", + "dynamodb:CreateTable", + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:Query" + ], + "Resource": [ + "arn:aws:dynamodb:${REGION}:${AWS_ACCOUNT_ID}:table/${TABLE_NAME}", + "arn:aws:dynamodb:${REGION}:${AWS_ACCOUNT_ID}:table/${TABLE_NAME}/index/source-status" + ] +} +``` + + +| Value | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| `table_name` | Yes | String | The name of the table to be used for source coordination. | +| `region` | Yes | String | The region of the DynamoDB table. | +| `sts_role_arn` | No | String | The `sts` role that contains the table permissions. Uses default credentials when not provided. | +| `sts_external_id` | No | String | The external ID used in the API call to assume the `sts_role_arn`. | +| `skip_table_creation` | No | Boolean | If set to `true` when using an existing store, the attempt to create the store is skipped. Default is `false`. | +| `provisioned_write_capacity_units` | No | Integer | The number of write capacity units to configure on the table. Default is `10`. | +| `provisioned_read_capacity_units` | No | Integer | The number of read capacity units to configure on the table. Default is `10`. | +| `ttl` | Duration | Optional. The duration of the TTL for the items in the table. The TTL is extended by this duration when an update is made to the item. Defaults to no TTL being used on the table. 
| + +The following example shows a `dynamodb` store: + +```yaml +source_coordination: + store: + dynamodb: + table_name: "DataPrepperSourceCoordinationStore" + region: "us-east-1" + sts_role_arn: "arn:aws:iam::##########:role/SourceCoordinationTableRole" + ttl: "P7D" + skip_table_creation: true +``` + +#### In-memory store (default) + +The following example shows an `in_memory` store, which is best used with a single-node cluster: + + +```yaml +source_coordination: + store: + in_memory: +``` + + +## Metrics + +Source coordination metrics are interpreted differently depending on which source is configured. The format of a source coordination metric is `_source_coordinator_`. You can use the sub-pipeline name to identify the source for these metrics because each sub-pipeline is unique to each source. + +### Progress metrics + +The following are metrics related to partition progress: + +* `partitionsCreatedCount`: The number of partition items that have been created. For an S3 scan, this is the number of objects that have had partitions created for them. +* `partitionsCompleted`: The number of partitions that have been fully processed and marked as `COMPLETED`. For an S3 scan, this is the number of objects that have been processed. +* `noPartitionsAcquired`: The number of times that a node has attempted to acquire a partition on which to perform work but has found no available partitions in the store. Use this to indicate that there is no more data coming into the source. +* `partitionsAcquired`: The number of partitions that have been acquired by nodes on which to perform work. In non-error scenarios, this should be equal to the number of partitions created. +* `partitionsClosed`: The number of partitions that have been marked as `CLOSED`. This is only applicable to sources that use the CLOSED functionality. + +The following are metrics related to partition errors: + +* `partitionNotFoundErrors`: Indicates that a partition item that is actively owned by a node does not have a corresponding store item. This should only occur if an item in the table has been manually deleted. +* `partitionNotOwnedErrors`: Indicates that a node that owns a partition has lost ownership due to the partition ownership timeout expiring. Unless the source is able to checkpoint the partition with `saveState`, this error results in duplicate item processing. +* `partitionUpdateErrors`: The number of errors that were received when an update to the store for this partition item failed. Is prefixed with either `saveState`, `close`, or `complete` to indicate which update action is failing. + diff --git a/_data-prepper/migrate-open-distro.md b/_data-prepper/migrate-open-distro.md new file mode 100644 index 00000000..e7fdacbd --- /dev/null +++ b/_data-prepper/migrate-open-distro.md @@ -0,0 +1,24 @@ +--- +layout: default +title: Migrating from Open Distro +nav_order: 30 +--- + +# Migrating from Open Distro + +Existing users can migrate from the Open Distro Data Prepper to OpenSearch Data Prepper. Beginning with Data Prepper version 1.1, there is only one distribution of OpenSearch Data Prepper. + +## Change your pipeline configuration + +The `elasticsearch` sink has changed to `opensearch`. Therefore, change your existing pipeline to use the `opensearch` plugin instead of `elasticsearch`. + +While the Data Prepper plugin is titled `opensearch`, it remains compatible with Open Distro and ElasticSearch 7.x. 
+{: .note}
+
+## Update Docker image
+
+In your Data Prepper Docker configuration, adjust `amazon/opendistro-for-elasticsearch-data-prepper` to `opensearchproject/data-prepper`. This change will download the latest Data Prepper Docker image.
+
+## Next steps
+
+For more information about Data Prepper configurations, see [Getting Started with Data Prepper]({{site.url}}{{site.baseurl}}/clients/data-prepper/get-started/).
diff --git a/_data-prepper/migrating-from-logstash-data-prepper.md b/_data-prepper/migrating-from-logstash-data-prepper.md
new file mode 100644
index 00000000..f87ca8d6
--- /dev/null
+++ b/_data-prepper/migrating-from-logstash-data-prepper.md
@@ -0,0 +1,47 @@
+---
+layout: default
+title: Migrating from Logstash
+nav_order: 25
+redirect_from:
+  - /data-prepper/configure-logstash-data-prepper/
+---
+
+# Migrating from Logstash
+
+You can run Data Prepper with a Logstash configuration.
+
+As mentioned in [Getting started with Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/), you'll need to configure Data Prepper with a pipeline using a `pipelines.yaml` file.
+
+Alternatively, if you already have a Logstash configuration, you can use your `logstash.conf` file to configure Data Prepper instead of `pipelines.yaml`.
+
+## Supported plugins
+
+As of the Data Prepper 1.2 release, the following plugins from the Logstash configuration are supported:
+* HTTP Input plugin
+* Grok Filter plugin
+* Elasticsearch Output plugin
+* Amazon Elasticsearch Output plugin
+
+## Limitations
+* Apart from the supported plugins, all other plugins from the Logstash configuration will throw an `Exception` and fail to run.
+* Conditionals in the Logstash configuration are not supported as of the Data Prepper 1.2 release.
+
+## Running Data Prepper with a Logstash configuration
+
+1. To install Data Prepper's Docker image, see Installing Data Prepper in [Getting Started with Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started#1-installing-data-prepper).
+
+2. Run the Docker image installed in Step 1 by supplying your `logstash.conf` configuration.
+
+```
+docker run --name data-prepper -p 4900:4900 -v ${PWD}/logstash.conf:/usr/share/data-prepper/pipelines.conf opensearchproject/data-prepper:latest pipelines.conf
+```
+
+The `logstash.conf` file is converted to `logstash.yaml` by mapping the plugins and attributes in the Logstash configuration to the corresponding plugins and attributes in Data Prepper.
+You can find the converted `logstash.yaml` file in the same directory where you stored `logstash.conf`.
+
+
+The following output in your terminal indicates that Data Prepper is running correctly:
+
+```
+INFO org.opensearch.dataprepper.pipeline.ProcessWorker - log-pipeline Worker: No records received from buffer
+```
diff --git a/_data-prepper/pipelines/configuration/buffers/bounded-blocking.md b/_data-prepper/pipelines/configuration/buffers/bounded-blocking.md
new file mode 100644
index 00000000..8370cf22
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/buffers/bounded-blocking.md
@@ -0,0 +1,26 @@
+---
+layout: default
+title: Bounded blocking
+parent: Buffers
+grand_parent: Pipelines
+nav_order: 50
+---
+
+# Bounded blocking
+
+## Overview
+
+`Bounded blocking` is the default buffer and is memory based. The following table describes the `Bounded blocking` parameters.
+
+| Option | Required | Type | Description |
+| --- | --- | --- | --- |
+| buffer_size | No | Integer | The maximum number of records the buffer accepts. Default value is `12800`.
| +| batch_size | No | Integer | The maximum number of records the buffer drains after each read. Default value is `200`. | + + \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/buffers/buffers.md b/_data-prepper/pipelines/configuration/buffers/buffers.md new file mode 100644 index 00000000..eeb68260 --- /dev/null +++ b/_data-prepper/pipelines/configuration/buffers/buffers.md @@ -0,0 +1,11 @@ +--- +layout: default +title: Buffers +parent: Pipelines +has_children: true +nav_order: 20 +--- + +# Buffers + +Buffers store data as it passes through the pipeline. If you implement a custom buffer, it can be memory based, which provides better performance, or disk based, which is larger in size. \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/buffers/kafka.md b/_data-prepper/pipelines/configuration/buffers/kafka.md new file mode 100644 index 00000000..675a0c97 --- /dev/null +++ b/_data-prepper/pipelines/configuration/buffers/kafka.md @@ -0,0 +1,144 @@ +--- +layout: default +title: kafka +parent: Buffers +grand_parent: Pipelines +nav_order: 80 +--- + +# kafka + +The `kafka` buffer buffers data into an Apache Kafka topic. It uses the Kafka topic to persist data while the data is in transit. + +The following example shows how to run the Kafka buffer in an HTTP pipeline. +It runs against a locally running Kafka cluster. + +``` +kafka-buffer-pipeline: + source: + http: + buffer: + kafka: + bootstrap_servers: ["localhost:9092"] + encryption: + type: none + topics: + - name: my-buffer-topic + group_id: data-prepper + create_topic: true + processor: + - grok: + match: + message: [ "%{COMMONAPACHELOG}" ] + sink: + - stdout: +``` + +## Configuration options + +Use the following configuration options with the `kafka` buffer. + + +Option | Required | Type | Description +--- | --- | --- | --- +`bootstrap_servers` | Yes | String list | The host and port for the initial connection to the Kafka cluster. You can configure multiple Kafka brokers by using the IP address or the port number for each broker. When using [Amazon Managed Streaming for Apache Kafka (Amazon MSK)](https://aws.amazon.com/msk/) as your Kafka cluster, the bootstrap server information is obtained from Amazon MSK using the Amazon Resource Name (ARN) provided in the configuration. +`topics` | Yes | List | A list of [topics](#topic) to use. You must supply one topic per buffer. +`authentication` | No | [Authentication](#authentication) | Sets the authentication options for both the pipeline and Kafka. For more information, see [Authentication](#authentication). +`encryption` | No | [Encryption](#encryption) | The encryption configuration for encryption in transit. For more information, see [Encryption](#encryption). +`aws` | No | [AWS](#aws) | The AWS configuration. For more information, see [aws](#aws). + + +### topic + +The `topic` option configures a single Kafka topic and tells the `kafka` buffer how to use that topic. + + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`name` | Yes | String | The name of the Kafka topic. +`group_id` | Yes | String | Sets Kafka's `group.id` option. +`workers` | No | Integer | The number of multithreaded consumers associated with each topic. Default is `2`. The maximum value is `200`. +`encryption_key` | No | String | An Advanced Encryption Standard (AES) encryption key used to encrypt and decrypt data within Data Prepper before sending it to Kafka. This value must be plain text or encrypted using AWS Key Management Service (AWS KMS). 
+`kms` | No | AWS KMS key | When configured, uses an AWS KMS key to encrypt data. See [`kms`](#kms) for more information. +`auto_commit` | No | Boolean | When `false`, the consumer offset will not be periodically committed to Kafka in the background. Default is `false`. +`commit_interval` | No | Integer | When `auto_commit` is set to `true`, sets how often, in seconds, the consumer offsets are auto-committed to Kafka through Kafka's `auto.commit.interval.ms` option. Default is `5s`. +`session_timeout` | No | Integer | The amount of time during which the source detects client failures when using Kafka's group management features, which can be used to balance the data stream. Default is `45s`. +`auto_offset_reset` | No | String | Automatically resets the offset to the earliest or the latest offset through Kafka's `auto.offset.reset` option. Default is `latest`. +`thread_waiting_time` | No | Integer | The amount of time that a thread waits for the preceding thread to complete its task and to signal the next thread. The Kafka consumer API poll timeout value is set to half of this setting. Default is `5s`. +`max_partition_fetch_bytes` | No | Integer | Sets the maximum limit, in megabytes, for data returns from each partition through Kafka's `max.partition.fetch.bytes` setting. Default is `1mb`. +`heart_beat_interval` | No | Integer | The expected amount of time between heartbeats to the consumer coordinator when using Kafka's group management facilities through Kafka's `heartbeat.interval.ms` setting. Default is `5s`. +`fetch_max_wait` | No | Integer | The maximum amount of time during which the server blocks a fetch request when there isn't sufficient data to satisfy the `fetch_min_bytes` requirement through Kafka's `fetch.max.wait.ms` setting. Default is `500ms`. +`fetch_max_bytes` | No | Integer | The maximum record size accepted by the broker through Kafka's `fetch.max.bytes` setting. Default is `50mb`. +`fetch_min_bytes` | No | Integer | The minimum amount of data the server returns during a fetch request through Kafka's `retry.backoff.ms` setting. Default is `1b`. +`retry_backoff` | No | Integer | The amount of time to wait before attempting to retry a failed request to a given topic partition. Default is `10s`. +`max_poll_interval` | No | Integer | The maximum delay between invocations of a `poll()` when using group management through Kafka's `max.poll.interval.ms` option. Default is `300s`. +`consumer_max_poll_records` | No | Integer | The maximum number of records returned in a single `poll()` call through Kafka's `max.poll.records` setting. Default is `500`. + + +### kms + +When using AWS KMS, the AWS KMS key can decrypt the `encryption_key` so that it is not stored in plain text. To configure AWS KMS with the `kafka` buffer, use the following options. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`key_id` | Yes | String | The ID of the AWS KMS key. It may be the full key ARN or a key alias. +`region` | No | String | The AWS Region of the AWS KMS key. +`sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role ARN to use to access the AWS KMS key. +`encryption_context` | No | Map | When provided, messages sent to the topic will include this map as an AWS KMS encryption context. + + +### Authentication + +The following option is required inside the `authentication` object. + +Option | Type | Description +:--- | :--- | :--- +`sasl` | JSON object | The Simple Authentication and Security Layer (SASL) authentication configuration. 
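+
+As an illustration only, the following sketch extends the HTTP pipeline example above with SASL authentication, assuming the `plaintext` option described in the next section and placeholder credentials:
+
+```
+kafka-buffer-pipeline:
+  source:
+    http:
+  buffer:
+    kafka:
+      bootstrap_servers: ["localhost:9092"]
+      encryption:
+        type: none
+      topics:
+        - name: my-buffer-topic
+          group_id: data-prepper
+      authentication:
+        sasl:
+          plaintext:
+            username: "my-username"   # placeholder
+            password: "my-password"   # placeholder
+  processor:
+    - grok:
+        match:
+          message: [ "%{COMMONAPACHELOG}" ]
+  sink:
+    - stdout:
+```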

### SASL

Use one of the following options when configuring SASL authentication.

Option | Type | Description
:--- | :--- | :---
`plaintext` | JSON object | The [PLAINTEXT](#sasl-plaintext) authentication configuration.
`aws_msk_iam` | String | The Amazon MSK AWS Identity and Access Management (IAM) configuration. If set to `role`, the `sts_role_arn` set in the `aws` configuration is used. Default is `default`.

#### SASL PLAINTEXT

The following options are required when using the [SASL PLAINTEXT](https://kafka.apache.org/10/javadoc/org/apache/kafka/common/security/auth/SecurityProtocol.html) protocol.

Option | Type | Description
:--- | :--- | :---
`username` | String | The username for the PLAINTEXT authentication.
`password` | String | The password for the PLAINTEXT authentication.

### Encryption

Use the following options when setting SSL encryption.

Option | Required | Type | Description
:--- | :--- | :--- | :---
`type` | No | String | The encryption type. Use `none` to disable encryption. Default is `ssl`.
`insecure` | No | Boolean | A Boolean flag used to turn off SSL certificate verification. If set to `true`, certificate authority (CA) certificate verification is turned off and insecure HTTP requests are sent. Default is `false`.


### aws

Use the following options when setting up authentication for `aws` services.

Option | Required | Type | Description
:--- | :--- | :--- | :---
`region` | No | String | The AWS Region to use for credentials. Defaults to the [standard SDK behavior for determining the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html).
`sts_role_arn` | No | String | The AWS STS role to assume for AWS requests, such as those sent to Amazon MSK and AWS KMS. Default is `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html).
`msk` | No | JSON object | The [Amazon MSK](#msk) configuration settings.

#### msk

Use the following options inside the `msk` object.

Option | Required | Type | Description
:--- | :--- | :--- | :---
`arn` | Yes | String | The [Amazon MSK ARN](https://docs.aws.amazon.com/msk/1.0/apireference/configurations-arn.html) to use.
`broker_connection_type` | No | String | The type of connector to use with the Amazon MSK broker, either `public`, `single_vpc`, or `multi_vpc`. Default is `single_vpc`.
diff --git a/_data-prepper/pipelines/configuration/processors/add-entries.md b/_data-prepper/pipelines/configuration/processors/add-entries.md
new file mode 100644
index 00000000..23e4772f
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/processors/add-entries.md
@@ -0,0 +1,64 @@
---
layout: default
title: add_entries
parent: Processors
grand_parent: Pipelines
nav_order: 40
---

# add_entries

The `add_entries` processor adds entries to an event.

### Configuration

You can configure the `add_entries` processor with the following options.

| Option | Required | Description |
| :--- | :--- | :--- |
| `entries` | Yes | A list of entries to add to an event. |
| `key` | Yes | The key of the new entry to be added. Some examples of keys include `my_key`, `myKey`, and `object/sub_Key`. |
| `metadata_key` | Yes | The key for the new metadata attribute. The argument must be a literal string key and not a JSON Pointer. Either `key` or `metadata_key` is required.
| +| `format` | No | A format string to use as the value of the new entry, for example, `${key1}-${key2}`, where `key1` and `key2` are existing keys in the event. Required if neither `value` nor `value_expression` is specified. | +| `value_expression` | No | An expression string to use as the value of the new entry. For example, `/key` is an existing key in the event with a type of either a number, a string, or a Boolean. Expressions can also contain functions returning number/string/integer. For example, `length(/key)` will return the length of the key in the event when the key is a string. For more information about keys, see [Expression syntax](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/). | +| `add_when` | No | A [conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that will be evaluated to determine whether the processor will be run on the event. | +| `value` | Yes | The value of the new entry to be added. You can use the following data types: strings, Booleans, numbers, null, nested objects, and arrays. | +| `overwrite_if_key_exists` | No | When set to `true`, the existing value is overwritten if `key` already exists in the event. The default value is `false`. | + +### Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + ... + .... + processor: + - add_entries: + entries: + - key: "newMessage" + value: 3 + overwrite_if_key_exists: true + - metadata_key: myMetadataKey + value_expression: 'length("newMessage")' + add_when: '/some_key == "test"' + sink: +``` +{% include copy.html %} + + +For example, when your source contains the following event record: + +```json +{"message": "hello"} +``` + +And then you run the `add_entries` processor using the example pipeline, it adds a new entry, `{"newMessage": 3}`, to the existing event, `{"message": "hello"}`, so that the new event contains two entries in the final output: + +```json +{"message": "hello", "newMessage": 3} +``` + +> If `newMessage` already exists, its existing value is overwritten with a value of `3`. + diff --git a/_data-prepper/pipelines/configuration/processors/aggregate.md b/_data-prepper/pipelines/configuration/processors/aggregate.md new file mode 100644 index 00000000..699d2502 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/aggregate.md @@ -0,0 +1,179 @@ +--- +layout: default +title: aggregate +parent: Processors +grand_parent: Pipelines +nav_order: 41 +--- + +# aggregate + +The `aggregate` processor groups events based on the values of `identification_keys`. Then, the processor performs an action on each group, helping reduce unnecessary log volume and creating aggregated logs over time. You can use existing actions or create your own custom aggregations using Java code. + + +## Configuration + +The following table describes the options you can use to configure the `aggregate` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +identification_keys | Yes | List | An unordered list by which to group events. Events with the same values as these keys are put into the same group. If an event does not contain one of the `identification_keys`, then the value of that key is considered to be equal to `null`. At least one identification_key is required (for example, `["sourceIp", "destinationIp", "port"]`). +action | Yes | AggregateAction | The action to be performed on each group. 
One of the [available aggregate actions](#available-aggregate-actions) must be provided, or you can create custom aggregate actions. The available actions are `remove_duplicates`, `put_all`, `count`, `histogram`, `rate_limiter`, and `percent_sampler`. For more information, see [Creating New Aggregate Actions](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#creating-new-aggregate-actions).
group_duration | No | String | The amount of time that a group should exist before it is concluded automatically. Supports ISO 8601 notation strings ("PT20.345S", "PT15M", etc.) as well as simple notation for seconds (`"60s"`) and milliseconds (`"1500ms"`). Default value is `180s`.

## Available aggregate actions

Use the following aggregate actions to determine how the `aggregate` processor processes events in each group.

### remove_duplicates

The `remove_duplicates` action processes the first event for a group immediately and drops any events that duplicate the first event from the source. For example, when using `identification_keys: ["sourceIp", "destination_ip"]`:

1. The `remove_duplicates` action processes `{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200 }`, the first event in the source.
2. Data Prepper drops the `{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 }` event because the `sourceIp` and `destinationIp` match the first event in the source.
3. The `remove_duplicates` action processes the next event, `{ "sourceIp": "127.0.0.2", "destinationIp": "192.168.0.1", "bytes": 1000 }`. Because the `sourceIp` is different from the first event of the group, Data Prepper creates a new group based on the event.

### put_all

The `put_all` action combines events belonging to the same group by overwriting existing keys and adding new keys, similarly to the Java `Map.putAll` method. The action drops all events that make up the combined event. For example, when using `identification_keys: ["sourceIp", "destination_ip"]`, the `put_all` action processes the following three events:

```
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200 }
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 }
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "http_verb": "GET" }
```

Then the action combines the events into one. The pipeline then uses the following combined event:

```
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200, "bytes": 1000, "http_verb": "GET" }
```

### count

The `count` action counts events that belong to the same group and generates a new event with the values of the `identification_keys` and the count, which indicates the number of new events. You can customize the processor with the following configuration options:

* `count_key`: Key used for storing the count. Default name is `aggr._count`.
* `start_time_key`: Key used for storing the start time. Default name is `aggr._start_time`.
* `output_format`: Format of the aggregated event.
  * `otel_metrics`: Default output format. Outputs in OTel metrics `SUM` type with the count as the value.
  * `raw`: Generates a JSON object with the `count_key` field as the count value and the `start_time_key` field with the aggregation start time as its value.

For example, when using `identification_keys: ["sourceIp", "destination_ip"]`, the `count` action counts and processes the following events:

```json
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200 }
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 503 }
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 400 }
```

The processor creates the following event:

```json
{"isMonotonic":true,"unit":"1","aggregationTemporality":"AGGREGATION_TEMPORALITY_DELTA","kind":"SUM","name":"count","description":"Number of events","startTime":"2022-12-02T19:29:51.245358486Z","time":"2022-12-02T19:30:15.247799684Z","value":3.0,"sourceIp":"127.0.0.1","destinationIp":"192.168.0.1"}
```

### histogram

The `histogram` action aggregates events belonging to the same group and generates a new event with the values of the `identification_keys` and a histogram of the aggregated events based on a configured `key`. The histogram contains the number of events, the sum, the buckets, the bucket counts, and, optionally, the min and max of the values corresponding to the `key`. The action drops all events that make up the combined event.

You can customize the processor with the following configuration options:

* `key`: Name of the field in the events for which the histogram is generated.
* `generated_key_prefix`: `key_prefix` used by all the fields created in the aggregated event. Having a prefix ensures that the names of the histogram event do not conflict with the field names in the event.
* `units`: The units for the values in the `key`.
* `record_minmax`: A Boolean value indicating whether the histogram should include the min and max of the values in the aggregation.
* `buckets`: A list of buckets (values of type `double`) indicating the buckets in the histogram.
* `output_format`: Format of the aggregated event.
  * `otel_metrics`: Default output format. Outputs in OTel metrics `HISTOGRAM` type, as shown in the example event below.
  * `raw`: Generates a JSON object with the `count_key` field with the count as its value and the `start_time_key` field with the aggregation start time as its value.

For example, when using `identification_keys: ["sourceIp", "destination_ip", "request"]`, `key: latency`, and `buckets: [0.0, 0.25, 0.5]`, the `histogram` action processes the following events:

```
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "request" : "/index.html", "latency": 0.2 }
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "request" : "/index.html", "latency": 0.55 }
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "request" : "/index.html", "latency": 0.25 }
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "request" : "/index.html", "latency": 0.15 }
```

Then the processor creates the following event:

```json
{"max":0.55,"kind":"HISTOGRAM","buckets":[{"min":-3.4028234663852886E38,"max":0.0,"count":0},{"min":0.0,"max":0.25,"count":2},{"min":0.25,"max":0.50,"count":1},{"min":0.50,"max":3.4028234663852886E38,"count":1}],"count":4,"bucketCountsList":[0,2,1,1],"description":"Histogram of latency in the events","sum":1.15,"unit":"seconds","aggregationTemporality":"AGGREGATION_TEMPORALITY_DELTA","min":0.15,"bucketCounts":4,"name":"histogram","startTime":"2022-12-14T06:43:40.848762215Z","explicitBoundsCount":3,"time":"2022-12-14T06:44:04.852564623Z","explicitBounds":[0.0,0.25,0.5],"request":"/index.html","sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "key": "latency"}
```

### rate_limiter

The `rate_limiter` action controls the number of events aggregated per second. By default, `rate_limiter` blocks the `aggregate` processor from running if it receives more events than the configured number allowed. You can change how the excess events are handled by using the `when_exceeds` configuration option.

You can customize the processor with the following configuration options:

* `events_per_second`: The number of events allowed per second.
* `when_exceeds`: Indicates what action the `rate_limiter` takes when the number of events received is greater than the number of events allowed per second. Default value is `block`, which blocks the processor from running after the maximum number of events allowed per second is reached until the next second. Alternatively, the `drop` option drops the excess events received in that second.

For example, if `events_per_second` is set to `1` and `when_exceeds` is set to `drop`, the action tries to process the following events when they are received during the one-second interval:

```json
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200 }
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 }
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "http_verb": "GET" }
```

Because `when_exceeds` is set to `drop`, only the following event is processed, and the excess events received during that second are dropped:

```json
{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200 }
```

If `when_exceeds` is set to `block` instead, all three events are eventually processed.

### percent_sampler

The `percent_sampler` action controls the number of events aggregated based on a percentage of events. The action drops any events not included in the percentage.

You can set the percentage of events using the `percent` configuration, which indicates the percentage of events processed during a one-second interval (0%--100%).
+ +For example, if percent is set to `50`, the action tries to process the following events in the one-second interval: + +``` +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 2500 } +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 500 } +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 } +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 3100 } +``` + +The pipeline processes 50% of the events, drops the other events, and does not generate a new event: + +``` +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 500 } +{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 3100 } +``` + +## Metrics + +The following table describes common [Abstract processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/processor/AbstractProcessor.java) metrics. + +| Metric name | Type | Description | +| ------------- | ---- | -----------| +| `recordsIn` | Counter | Metric representing the ingress of records to a pipeline component. | +| `recordsOut` | Counter | Metric representing the egress of records from a pipeline component. | +| `timeElapsed` | Timer | Metric representing the time elapsed during execution of a pipeline component. | + + +The `aggregate` processor includes the following custom metrics. + +**Counter** + +* `actionHandleEventsOut`: The number of events that have been returned from the `handleEvent` call to the configured [action](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#action). +* `actionHandleEventsDropped`: The number of events that have not been returned from the `handleEvent` call to the configured [action](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#action). +* `actionHandleEventsProcessingErrors`: The number of calls made to `handleEvent` for the configured [action](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#action) that resulted in an error. +* `actionConcludeGroupEventsOut`: The number of events that have been returned from the `concludeGroup` call to the configured [action](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#action). +* `actionConcludeGroupEventsDropped`: The number of events that have not been returned from the `condludeGroup` call to the configured [action](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#action). +* `actionConcludeGroupEventsProcessingErrors`: The number of calls made to `concludeGroup` for the configured [action](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#action) that resulted in an error. + +**Gauge** + +* `currentAggregateGroups`: The current number of groups. This gauge decreases when a group concludes and increases when an event initiates the creation of a new group. 
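
As a point of reference, the options described above can be combined into a pipeline definition. The following is a minimal sketch, assuming an `http` source and the `put_all` action; the identification keys and group duration are illustrative:

```yaml
aggregate-pipeline:
  source:
    http:
  processor:
    - aggregate:
        identification_keys: ["sourceIp", "destinationIp"]
        action:
          put_all:
        group_duration: "30s"
  sink:
    - stdout:
```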
\ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md new file mode 100644 index 00000000..9628bb6c --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md @@ -0,0 +1,76 @@ +--- +layout: default +title: anomaly_detector +parent: Processors +grand_parent: Pipelines +nav_order: 45 +--- + +# anomaly_detector + +The anomaly detector processor takes structured data and runs anomaly detection algorithms on fields that you can configure in that data. The data must be either an integer or a real number for the anomaly detection algorithm to detect anomalies. Deploying the aggregate processor in a pipeline before the anomaly detector processor can help you achieve the best results, as the aggregate processor automatically aggregates events by key and keeps them on the same host. For example, if you are searching for an anomaly in latencies from a specific IP address and if all the events go to the same host, then the host has more data for these events. This additional data results in better training of the machine learning (ML) algorithm, which results in better anomaly detection. + +## Configuration + +You can configure the anomaly detector processor by specifying a key and the options for the selected mode. You can use the following options to configure the anomaly detector processor. + +| Name | Required | Description | +| :--- | :--- | :--- | +| `keys` | Yes | A non-ordered `List` that is used as input to the ML algorithm to detect anomalies in the values of the keys in the list. At least one key is required. +| `mode` | Yes | The ML algorithm (or model) used to detect anomalies. You must provide a mode. See [random_cut_forest mode](#random_cut_forest-mode). +| `identification_keys` | No | If provided, anomalies will be detected within each unique instance of this key. For example, if you provide the `ip` field, anomalies will be detected separately for each unique IP address. +| `cardinality_limit` | No | If using the `identification_keys` settings, a new ML model will be created for every degree of cardinality. This can cause a large amount of memory usage, so it is helpful to set a limit on the number of models. Default limit is 5000. +| `verbose` | No | RCF will try to automatically learn and reduce the number of anomalies detected. For example, if latency is consistently between 50 and 100, and then suddenly jumps to around 1000, only the first one or two data points after the transition will be detected (unless there are other spikes/anomalies). Similarly, for repeated spikes to the same level, RCF will likely eliminate many of the spikes after a few initial ones. This is because the default setting is to minimize the number of alerts detected. Setting the `verbose` setting to `true` will cause RCF to consistently detect these repeated cases, which may be useful for detecting anomalous behavior that lasts an extended period of time. + + +### Keys + +Keys that are used in the anomaly detector processor are present in the input event. For example, if the input event is `{"key1":value1, "key2":value2, "key3":value3}`, then any of the keys (such as `key1`, `key2`, `key3`) in that input event can be used as anomaly detector keys as long as their value (such as `value1`, `value2`, `value3`) is an integer or real number. 
+ +### random_cut_forest mode + +The random cut forest (RCF) ML algorithm is an unsupervised algorithm for detecting anomalous data points within a dataset. To detect anomalies, the anomaly detector processor uses the `random_cut_forest` mode. + +| Name | Description | +| :--- | :--- | +| `random_cut_forest` | Processes events using the RCF ML algorithm to detect anomalies. | + +RCF is an unsupervised ML algorithm for detecting anomalous data points within a dataset. Data Prepper uses RCF to detect anomalies in data by passing the values of the configured key to RCF. For example, when an event with a latency value of 11.5 is sent, the following anomaly event is generated: + + + ```json + { "latency": 11.5, "deviation_from_expected":[10.469302736820003],"grade":1.0} +``` + +In this example, `deviation_from_expected` is a list of deviations for each of the keys from their corresponding expected values, and `grade` is the anomaly grade that indicates the anomaly severity. + + +You can configure `random_cut_forest` mode with the following options. + +| Name | Default value | Range | Description | +| :--- | :--- | :--- | :--- | +| `shingle_size` | `4` | 1--60 | The shingle size used in the ML algorithm. | +| `sample_size` | `256` | 100--2500 | The sample size used in the ML algorithm. | +| `time_decay` | `0.1` | 0--1.0 | The time decay value used in the ML algorithm. Used as the mathematical expression `timeDecay` divided by `SampleSize` in the ML algorithm. | +| `type` | `metrics` | N/A | The type of data sent to the algorithm. | +| `version` | `1.0` | N/A | The algorithm version number. | + +## Usage + +To get started, create the following `pipeline.yaml` file. You can use the following pipeline configuration to look for anomalies in the `latency` field in events that are passed to the processor. Then you can use the following YAML configuration file `random_cut_forest` mode to detect anomalies: + +```yaml +ad-pipeline: + source: + ... + .... + processor: + - anomaly_detector: + keys: ["latency"] + mode: + random_cut_forest: +``` + +When you run the anomaly detector processor, the processor extracts the value for the `latency` key, and then passes the value through the RCF ML algorithm. You can configure any key that comprises integers or real numbers as values. In the following example, you can configure `bytes` or `latency` as the key for an anomaly detector. + +`{"ip":"1.2.3.4", "bytes":234234, "latency":0.2}` diff --git a/_data-prepper/pipelines/configuration/processors/convert_entry_type.md b/_data-prepper/pipelines/configuration/processors/convert_entry_type.md new file mode 100644 index 00000000..2fc9fdb9 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/convert_entry_type.md @@ -0,0 +1,51 @@ +--- +layout: default +title: convert_entry_type +parent: Processors +grand_parent: Pipelines +nav_order: 47 +--- + +# convert_entry_type + +The `convert_entry_type` processor converts a value type associated with the specified key in a event to the specified type. It is a casting processor that changes the types of some fields in events. Some data must be converted to a different type, such as an integer to a double, or a string to an integer, so that it will pass the events through condition-based processors or perform conditional routing. + +## Configuration + +You can configure the `convert_entry_type` processor with the following options. 
+ +| Option | Required | Description | +| :--- | :--- | :--- | +| `key`| Yes | Keys whose value needs to be converted to a different type. | +| `type` | No | Target type for the key-value pair. Possible values are `integer`, `double`, `string`, and `Boolean`. Default value is `integer`. | + +## Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +type-conv-pipeline: + source: + ... + .... + processor: + - convert_entry_type: + key: "response_status" + type: "integer" +``` +{% include copy.html %} + +Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). + +For example, before you run the `convert_entry_type` processor, if the `logs_json.log` file contains the following event record: + + +```json +{"message": "value", "response_status":"200"} +``` + +The `convert_entry_type` processor converts the output received to the following output, where the type of `response_status` value changes from a string to an integer: + +```json +{"message":"value","response_status":200} +``` diff --git a/_data-prepper/pipelines/configuration/processors/copy-values.md b/_data-prepper/pipelines/configuration/processors/copy-values.md new file mode 100644 index 00000000..f654e6f0 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/copy-values.md @@ -0,0 +1,57 @@ +--- +layout: default +title: copy_values +parent: Processors +grand_parent: Pipelines +nav_order: 48 +--- + +# copy_values + +The `copy_values` processor copies values within an event and is a [mutate event]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/mutate-event/) processor. + +## Configuration + +You can configure the `copy_values` processor with the following options. + +| Option | Required | Description | +:--- | :--- | :--- +| `entries` | Yes | A list of entries to be copied in an event. | +| `from_key` | Yes | The key of the entry to be copied. | +| `to_key` | Yes | The key of the new entry to be added. | +| `overwrite_if_to_key_exists` | No | When set to `true`, the existing value is overwritten if `key` already exists in the event. The default value is `false`. | + +## Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + ... + .... + processor: + - copy_values: + entries: + - from_key: "message" + to_key: "newMessage" + overwrite_if_to_key_exists: true + sink: +``` +{% include copy.html %} + +Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). + +For example, before you run the `copy_values` processor, if the `logs_json.log` file contains the following event record: + +```json +{"message": "hello"} +``` + +When you run this processor, it parses the message into the following output: + +```json +{"message": "hello", "newMessage": "hello"} +``` + +If `newMessage` already exists, its existing value is overwritten with `value`. 
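
Because `entries` is a list, a single `copy_values` processor can copy several fields at once. The following sketch copies two keys; the `status` field is illustrative and not part of the example log above:

```yaml
processor:
  - copy_values:
      entries:
        - from_key: "message"
          to_key: "newMessage"
          overwrite_if_to_key_exists: true
        - from_key: "status"
          to_key: "statusCode"
```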
diff --git a/_data-prepper/pipelines/configuration/processors/csv.md b/_data-prepper/pipelines/configuration/processors/csv.md new file mode 100644 index 00000000..e386db4b --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/csv.md @@ -0,0 +1,116 @@ +--- +layout: default +title: csv +parent: Processors +grand_parent: Pipelines +nav_order: 49 +--- + +# csv + +The `csv` processor parses comma-separated values (CSVs) from the event into columns. + +## Configuration + +The following table describes the options you can use to configure the `csv` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +source | No | String | The field in the event that will be parsed. Default value is `message`. +quote_character | No | String | The character used as a text qualifier for a single column of data. Default value is `"`. +delimiter | No | String | The character separating each column. Default value is `,`. +delete_header | No | Boolean | If specified, the event header (`column_names_source_key`) is deleted after the event is parsed. If there is no event header, no action is taken. Default value is true. +column_names_source_key | No | String | The field in the event that specifies the CSV column names, which will be automatically detected. If there need to be extra column names, the column names are automatically generated according to their index. If `column_names` is also defined, the header in `column_names_source_key` can also be used to generate the event fields. If too few columns are specified in this field, the remaining column names are automatically generated. If too many column names are specified in this field, the CSV processor omits the extra column names. +column_names | No | List | User-specified names for the CSV columns. Default value is `[column1, column2, ..., columnN]` if there are no columns of data in the CSV record and `column_names_source_key` is not defined. If `column_names_source_key` is defined, the header in `column_names_source_key` generates the event fields. If too few columns are specified in this field, the remaining column names are automatically generated. If too many column names are specified in this field, the CSV processor omits the extra column names. + +## Usage + +Add the following examples to your `pipelines.yaml` file, depending on how you your CSV columns are formatted. + +### User-specified column names + +The following example `pipelines.yaml` configuration points to a file named `ingest.csv` as the source. Then, the `csv` processor parses the data from the `.csv` file using the column names specified in the `column_names` setting, as shown in the following example: + +```yaml +csv-pipeline: + source: + file: + path: "/full/path/to/ingest.csv" + record_type: "event" + processor: + - csv: + column_names: ["col1", "col2"] + sink: + - stdout: +``` +{% include copy.html %} + + +When run, the processor will parse the message. 
Although only two column names are specified in processor settings, a third column name is automatically generated because the data contained in `ingest.csv` includes three columns, `1,2,3`: + +``` +{"message": "1,2,3", "col1": "1", "col2": "2", "column3": "3"} +``` +### Automatically detect column names + +The following configuration automatically detects the header of a CSV file ingested through an [`s3 source`]({{site.url}}{{site.baseurl}}//data-prepper/pipelines/configuration/sources/s3/): + +```yaml +csv-s3-pipeline: + source: + s3: + notification_type: "sqs" + codec: + newline: + skip_lines: 1 + header_destination: "header" + compression: none + sqs: + queue_url: "https://sqs..amazonaws.com//" + aws: + region: "" + processor: + - csv: + column_names_source_key: "header" + sink: + - stdout: +``` +{% include copy.html %} + + +For example, if the `ingest.csv` file in the Amazon Simple Storage Service (Amazon S3) bucket that the Amazon Simple Queue Service (SQS) queue is attached to contains the following data: + +``` +Should,skip,this,line +a,b,c +1,2,3 +``` + +Then the `csv` processor will take the following event: + +```json +{"header": "a,b,c", "message": "1,2,3"} +``` + +Then, the processor parses the event into the following output. Because `delete_header` is `true` by default, the header `a,b,c` is deleted from the output: +```json +{"message": "1,2,3", "a": "1", "b": "2", "c": "3"} +``` + +## Metrics + +The following table describes common [Abstract processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/processor/AbstractProcessor.java) metrics. + +| Metric name | Type | Description | +| ------------- | ---- | -----------| +| `recordsIn` | Counter | Metric representing the ingress of records to a pipeline component. | +| `recordsOut` | Counter | Metric representing the egress of records from a pipeline component. | +| `timeElapsed` | Timer | Metric representing the time elapsed during execution of a pipeline component. | + +The `csv` processor includes the following custom metrics. + +**Counter** + +The `csv` processor includes the following counter metrics: + +* `csvInvalidEvents`: The number of invalid events, usually caused by an unclosed quotation mark in the event itself. Data Prepper throws an exception when an invalid event is parsed. diff --git a/_data-prepper/pipelines/configuration/processors/date.md b/_data-prepper/pipelines/configuration/processors/date.md new file mode 100644 index 00000000..27b571df --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/date.md @@ -0,0 +1,44 @@ +--- +layout: default +title: date +parent: Processors +grand_parent: Pipelines +nav_order: 50 +--- + +# date + + +The `date` processor adds a default timestamp to an event, parses timestamp fields, and converts timestamp information to the International Organization for Standardization (ISO) 8601 format. This timestamp information can be used as an event timestamp. + +## Configuration + +The following table describes the options you can use to configure the `date` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +match | Conditionally | List | List of `key` and `patterns` where patterns is a list. The list of match can have exactly one `key` and `patterns`. There is no default value. This option cannot be defined at the same time as `from_time_received`. Include multiple date processors in your pipeline if both options should be used. 
from_time_received | Conditionally | Boolean | When `true`, adds a default timestamp to the event data taken from the event metadata, which records the time at which the source received the event. Default value is `false`. This option cannot be defined at the same time as `match`. Include multiple date processors in your pipeline if both options should be used.
destination | No | String | The field used to store the timestamp parsed by the date processor. It can be used with both `match` and `from_time_received`. Default value is `@timestamp`.
source_timezone | No | String | The time zone used to parse dates. It is used when the zone or offset cannot be extracted from the value. If the zone or offset is part of the value, the time zone is ignored. All of the available time zones are listed in the **TZ database name** column of [the list of database time zones](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List).
destination_timezone | No | String | The time zone used for storing the timestamp in the `destination` field. The available time zone values are the same as for `source_timezone`.
locale | No | String | The locale used for parsing dates. It is commonly used for parsing month names (`MMM`). It can have language, country, and variant fields specified using either IETF BCP 47 or the string representation of a [Locale](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html) object, for example, `en-US` for IETF BCP 47 and `en_US` for the string representation. A full list of locale fields, which includes language, country, and variant, can be found in [the language subtag registry](https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry). Default value is `Locale.ROOT`.



## Metrics

The following table describes common [Abstract processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/processor/AbstractProcessor.java) metrics.

| Metric name | Type | Description |
| ------------- | ---- | -----------|
| `recordsIn` | Counter | Metric representing the ingress of records to a pipeline component. |
| `recordsOut` | Counter | Metric representing the egress of records from a pipeline component. |
| `timeElapsed` | Timer | Metric representing the time elapsed during execution of a pipeline component. |

The `date` processor includes the following custom metrics.

* `dateProcessingMatchSuccessCounter`: Returns the number of records that match at least one pattern specified by the `match` configuration option.
* `dateProcessingMatchFailureCounter`: Returns the number of records that did not match any of the patterns specified by the `match` configuration option.
\ No newline at end of file
diff --git a/_data-prepper/pipelines/configuration/processors/delete-entries.md b/_data-prepper/pipelines/configuration/processors/delete-entries.md
new file mode 100644
index 00000000..0546ed67
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/processors/delete-entries.md
@@ -0,0 +1,51 @@
---
layout: default
title: delete_entries
parent: Processors
grand_parent: Pipelines
nav_order: 51
---

# delete_entries

The `delete_entries` processor deletes entries, such as key-value pairs, from an event. You can define the keys you want to delete in the `with_keys` field following `delete_entries` in the YAML configuration file. Those keys and their values are deleted.

## Configuration

You can configure the `delete_entries` processor with the following options.
+ +| Option | Required | Description | +:--- | :--- | :--- +| `with_keys` | Yes | An array of keys for the entries to be deleted. | + +## Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + ... + .... + processor: + - delete_entries: + with_keys: ["message"] + sink: +``` +{% include copy.html %} + +Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). + +For example, before you run the `delete_entries` processor, if the `logs_json.log` file contains the following event record: + +```json +{"message": "hello", "message2": "goodbye"} +``` + +When you run the `delete_entries` processor, it parses the message into the following output: + +```json +{"message2": "goodbye"} +``` + +> If `message` does not exist in the event, then no action occurs. diff --git a/_data-prepper/pipelines/configuration/processors/dissect.md b/_data-prepper/pipelines/configuration/processors/dissect.md new file mode 100644 index 00000000..2d32ba47 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/dissect.md @@ -0,0 +1,96 @@ +--- +layout: default +title: dissect +parent: Processors +grand_parent: Pipelines +nav_order: 52 +--- + +# dissect + +The `dissect` processor extracts values from an event and maps them to individual fields based on user-defined `dissect` patterns. The processor is well suited for field extraction from log messages with a known structure. + +## Basic usage + +To use the `dissect` processor, create the following `pipeline.yaml` file: + +```yaml +dissect-pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - dissect: + map: + log: "%{Date} %{Time} %{Log_Type}: %{Message}" + sink: + - stdout: +``` + +Then create the following file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with the path of a file containing the following JSON data: + +``` +{"log": "07-25-2023 10:00:00 ERROR: error message"} +``` + +The `dissect` processor will retrieve the fields (`Date`, `Time`, `Log_Type`, and `Message`) from the `log` message, based on the pattern `%{Date} %{Time} %{Type}: %{Message}` configured in the pipeline. + +After running the pipeline, you should receive the following standard output: + +``` +{ + "log" : "07-25-2023 10:00:00 ERROR: Some error", + "Date" : "07-25-2023" + "Time" : "10:00:00" + "Log_Type" : "ERROR" + "Message" : "error message" +} +``` + +## Configuration + +You can configure the `dissect` processor with the following options. + +| Option | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| `map` | Yes | Map | Defines the `dissect` patterns for specific keys. For details on how to define fields in the `dissect` pattern, see [Field notations](#field-notations). | +| `target_types` | No | Map | Specifies the data types for extract fields. Valid options are `integer`, `double`, `string`, and `boolean`. By default, all fields are of the `string` type. | +| `dissect_when` | No | String | Specifies a condition for performing the `dissect` operation using a [Data Prepper expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). If specified, the `dissect` operation will only run when the expression evaluates to true. 
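
The following sketch shows the optional settings used together; the pattern, field names, and condition are illustrative rather than taken from the example above:

```yaml
processor:
  - dissect:
      map:
        log: "%{client_ip} %{status_code} %{response_time}"
      target_types:
        status_code: integer
        response_time: double
      dissect_when: '/log_type == "apache"'
```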
| + +### Field notations + +You can define `dissect` patterns with the following field types. + +#### Normal field + +A field without a suffix or prefix. The field will be directly added to the output event. The format is `%{field_name}`. + +#### Skip field + +A field that will not be included in the event. The format is `%{}` or `%{?field_name}`. + +#### Append field + +A field that will be combined with other fields. To append multiple values and include the final value in the field, use `+` before the field name in the `dissect` pattern. The format is `%{+field_name}`. + +For example, with the pattern `%{+field_name}, %{+field_name}`, log message `"foo, bar"` will parse into `{"field_name": "foobar"}`. + +You can also define the order of the concatenation with the help of the suffix `/`. + +For example, with a pattern `"%{+field_name/2}, %{+field_name/1}"`, log message `"foo, bar"` will parse into `{"field_name": "barfoo"}`. + +If the order is not mentioned, the append operation will occur in the order of the fields specified in the `dissect` pattern. + +#### Indirect field + +A field that uses the value from another field as its field name. When defining a pattern, prefix the field with a `&` to assign the value found in the field as the key in the key-value pair. + +For example, with a pattern `"%{?field_name}, %{&field_name}"`, the log message `"foo, bar"` will parse into `{“foo”: “bar”}`. In the log message, `foo` is captured from the skip field `%{?field_name}`. `foo` then serves as the key to the value captured from the field `%{&field_name}`. + +#### Padded field + +A field with the paddings to the right removed. The `->` operator can be used as a suffix to indicate that white spaces after this field can be ignored. + +For example, with a pattern `%{field1->} %{field2}`, log message `“firstname lastname”` will parse into `{“field1”: “firstname”, “field2”: “lastname”}`. diff --git a/_data-prepper/pipelines/configuration/processors/drop-events.md b/_data-prepper/pipelines/configuration/processors/drop-events.md new file mode 100644 index 00000000..d030f14a --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/drop-events.md @@ -0,0 +1,21 @@ +--- +layout: default +title: drop_events +parent: Processors +grand_parent: Pipelines +nav_order: 53 +--- + +# drop_events + + +The `drop_events` processor drops all the events that are passed into it. The following table describes when events are dropped and how exceptions for dropping events are handled. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +drop_when | Yes | String | Accepts a Data Prepper expression string following the [Data Prepper Expression Syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). Configuring `drop_events` with `drop_when: true` drops all the events received. +handle_failed_events | No | Enum | Specifies how exceptions are handled when an exception occurs while evaluating an event. Default value is `drop`, which drops the event so that it is not sent to OpenSearch. Available options are `drop`, `drop_silently`, `skip`, and `skip_silently`. For more information, see [handle_failed_events](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/drop-events-processor#handle_failed_events). 
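
For example, the following sketch drops debug-level events and silently keeps any event for which the expression cannot be evaluated; the `loglevel` key is illustrative:

```yaml
processor:
  - drop_events:
      drop_when: '/loglevel == "DEBUG"'
      handle_failed_events: skip_silently
```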
+ + diff --git a/_data-prepper/pipelines/configuration/processors/grok.md b/_data-prepper/pipelines/configuration/processors/grok.md new file mode 100644 index 00000000..d1eea278 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/grok.md @@ -0,0 +1,72 @@ +--- +layout: default +title: Grok +parent: Processors +grand_parent: Pipelines +nav_order: 54 +--- + +# Grok + +The Grok processor uses pattern matching to structure and extract important keys from unstructured data. + +## Configuration + +The following table describes options you can use with the Grok processor to structure your data and make your data easier to query. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +break_on_match | No | Boolean | Specifies whether to match all patterns or stop once the first successful match is found. Default value is `true`. +grok_when | No | String | Specifies under what condition the `Grok` processor should perform matching. Default is no condition. +keep_empty_captures | No | Boolean | Enables the preservation of `null` captures. Default value is `false`. +keys_to_overwrite | No | List | Specifies which existing keys will be overwritten if there is a capture with the same key value. Default value is `[]`. +match | No | Map | Specifies which keys to match specific patterns against. Default value is an empty body. +named_captures_only | No | Boolean | Specifies whether to keep only named captures. Default value is `true`. +pattern_definitions | No | Map | Allows for custom pattern use inline. Default value is an empty body. +patterns_directories | No | List | Specifies the path of directories that contain customer pattern files. Default value is an empty list. +pattern_files_glob | No | String | Specifies which pattern files to use from the directories specified for `pattern_directories`. Default value is `*`. +target_key | No | String | Specifies a parent-level key used to store all captures. Default value is `null`. +timeout_millis | No | Integer | The maximum amount of time during which matching occurs. Setting to `0` disables the timeout. Default value is `30,000`. + + + +## Conditional grok + +The Grok processor can be configured to run conditionally by using the `grok_when` option. The following is an example Grok processor configuration that uses `grok_when`: +``` +processor: + - grok: + grok_when: '/type == "ipv4"' + match: + message: ['%{IPV4:clientip} %{WORD:request} %{POSINT:bytes}'] + - grok: + grok_when: '/type == "ipv6"' + match: + message: ['%{IPV6:clientip} %{WORD:request} %{POSINT:bytes}'] +``` +The `grok_when` option can take a conditional expression. This expression is detailed in the [Expression syntax](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/) documentation. + +## Metrics + +The following table describes common [Abstract processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/processor/AbstractProcessor.java) metrics. + +| Metric name | Type | Description | +| ------------- | ---- | -----------| +| `recordsIn` | Counter | Metric representing the ingress of records to a pipeline component. | +| `recordsOut` | Counter | Metric representing the egress of records from a pipeline component. | +| `timeElapsed` | Timer | Metric representing the time elapsed during execution of a pipeline component. | + +The Grok processor includes the following custom metrics. 

### Counter

* `grokProcessingMismatch`: Records the number of records that did not match any of the patterns specified in the `match` field.
* `grokProcessingMatch`: Records the number of records that matched at least one pattern from the `match` field.
* `grokProcessingErrors`: Records the total number of record processing errors.
* `grokProcessingTimeouts`: Records the total number of records that timed out while matching.

### Timer

* `grokProcessingTime`: The time taken by individual records to match against `match` patterns. The `avg` metric is the most useful metric for this timer because it provides the average time taken to match records.
diff --git a/_data-prepper/pipelines/configuration/processors/key-value.md b/_data-prepper/pipelines/configuration/processors/key-value.md
new file mode 100644
index 00000000..884ae975
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/processors/key-value.md
@@ -0,0 +1,45 @@
---
layout: default
title: key_value
parent: Processors
grand_parent: Pipelines
nav_order: 56
---

# key_value


You can use the `key_value` processor to parse the specified field into key-value pairs. You can customize the `key_value` processor to parse field information with the following options. The type for each of the following options is `string`.

| Option | Description | Example |
| :--- | :--- | :--- |
| source | The message field to be parsed. Optional. Default value is `message`. | If `source` is `"message1"`, `{"message1": {"key1=value1"}, "message2": {"key2=value2"}}` parses into `{"message1": {"key1=value1"}, "message2": {"key2=value2"}, "parsed_message": {"key1": "value1"}}`. |
| destination | The destination field for the parsed source. The parsed source overwrites the preexisting data for that key. Optional. If `destination` is set to `null`, the parsed fields will be written to the root of the event. Default value is `parsed_message`. | If `destination` is `"parsed_data"`, `{"message": {"key1=value1"}}` parses into `{"message": {"key1=value1"}, "parsed_data": {"key1": "value1"}}`. |
| field_delimiter_regex | A regular expression specifying the delimiter that separates key-value pairs. Special regular expression characters such as `[` and `]` must be escaped with `\\`. Cannot be defined at the same time as `field_split_characters`. Optional. If this option is not defined, `field_split_characters` is used. | If `field_delimiter_regex` is `"&\\{2\\}"`, `{"key1=value1&&key2=value2"}` parses into `{"key1": "value1", "key2": "value2"}`. |
| field_split_characters | A string of characters specifying the delimiter that separates key-value pairs. Special regular expression characters such as `[` and `]` must be escaped with `\\`. Cannot be defined at the same time as `field_delimiter_regex`. Optional. Default value is `&`. | If `field_split_characters` is `"&&"`, `{"key1=value1&&key2=value2"}` parses into `{"key1": "value1", "key2": "value2"}`. |
| key_value_delimiter_regex | A regular expression specifying the delimiter that separates the key and value within a key-value pair. Special regular expression characters such as `[` and `]` must be escaped with `\\`. This option cannot be defined at the same time as `value_split_characters`. Optional. If this option is not defined, `value_split_characters` is used. | If `key_value_delimiter_regex` is `"=\\{2\\}"`, `{"key1==value1"}` parses into `{"key1": "value1"}`.
| +| value_split_characters | A string of characters specifying the delimiter that separates the key and value within a key-value pair. Special regular expression characters such as `[` and `]` must be escaped with `\\`. Cannot be defined at the same time as `key_value_delimiter_regex`. Optional. Default value is `=`. | If `value_split_characters` is `"=="`, `{"key1==value1"}` parses into `{"key1": "value1"}`. | +| non_match_value | When a key-value pair cannot be successfully split, the key-value pair is placed in the `key` field, and the specified value is placed in the `value` field. Optional. Default value is `null`. | `key1value1&key2=value2` parses into `{"key1value1": null, "key2": "value2"}`. | +| prefix | A prefix to append before all keys. Optional. Default value is an empty string. | If `prefix` is `"custom"`, `{"key1=value1"}` parses into `{"customkey1": "value1"}`.| +| delete_key_regex | A regular expression specifying the characters to delete from the key. Special regular expression characters such as `[` and `]` must be escaped with `\\`. Cannot be an empty string. Optional. No default value. | If `delete_key_regex` is `"\s"`, `{"key1 =value1"}` parses into `{"key1": "value1"}`. | +| delete_value_regex | A regular expression specifying the characters to delete from the value. Special regular expression characters such as `[` and `]` must be escaped with `\\`. Cannot be an empty string. Optional. No default value. | If `delete_value_regex` is `"\s"`, `{"key1=value1 "}` parses into `{"key1": "value1"}`. | +| include_keys | An array specifying the keys that should be added for parsing. By default, all keys will be added. | If `include_keys` is `["key2"]`,`key1=value1&key2=value2` will parse into `{"key2": "value2"}`. | +| exclude_keys | An array specifying the parsed keys that should not be added to the event. By default, no keys will be excluded. | If `exclude_keys` is `["key2"]`, `key1=value1&key2=value2` will parse into `{"key1": "value1"}`. | +| default_values | A map specifying the default keys and their values that should be added to the event in case these keys do not exist in the source field being parsed. If the default key already exists in the message, the value is not changed. The `include_keys` filter will be applied to the message before `default_values`. | If `default_values` is `{"defaultkey": "defaultvalue"}`, `key1=value1` will parse into `{"key1": "value1", "defaultkey": "defaultvalue"}`.
If `default_values` is `{"key1": "abc"}`, `key1=value1` will parse into `{"key1": "value1"}`.
If `include_keys` is `["key1"]` and `default_values` is `{"key2": "value2"}`, `key1=value1&key2=abc` will parse into `{"key1": "value1", "key2": "value2"}`. | +| transform_key | When to lowercase, uppercase, or capitalize keys. | If `transform_key` is `lowercase`, `{"Key1=value1"}` will parse into `{"key1": "value1"}`.
If `transform_key` is `uppercase`, `{"key1=value1"}` will parse into `{"KEY1": "value1"}`.
If `transform_key` is `capitalize`, `{"key1=value1"}` will parse into `{"Key1": "value1"}`. | +| whitespace | Specifies whether to be lenient or strict with the acceptance of unnecessary white space surrounding the configured value-split sequence. Default is `lenient`. | If `whitespace` is `"lenient"`, `{"key1 = value1"}` will parse into `{"key1 ": " value1"}`. If `whitespace` is `"strict"`, `{"key1 = value1"}` will parse into `{"key1": "value1"}`. | +| skip_duplicate_values | A Boolean option for removing duplicate key-value pairs. When set to `true`, only one unique key-value pair will be preserved. Default is `false`. | If `skip_duplicate_values` is `false`, `{"key1=value1&key1=value1"}` will parse into `{"key1": ["value1", "value1"]}`. If `skip_duplicate_values` is `true`, `{"key1=value1&key1=value1"}` will parse into `{"key1": "value1"}`. | +| remove_brackets | Specifies whether to treat square brackets, angle brackets, and parentheses as value "wrappers" that should be removed from the value. Default is `false`. | If `remove_brackets` is `true`, `{"key1=(value1)"}` will parse into `{"key1": value1}`. If `remove_brackets` is `false`, `{"key1=(value1)"}` will parse into `{"key1": "(value1)"}`. | +| recursive | Specifies whether to recursively obtain additional key-value pairs from values. The extra key-value pairs will be stored as sub-keys of the root key. Default is `false`. The levels of recursive parsing must be defined by different brackets for each level: `[]`, `()`, and `<>`, in this order. Any other configurations specified will only be applied to the outmost keys.
When `recursive` is `true`:
`remove_brackets` cannot also be `true`;
`skip_duplicate_values` will always be `true`;
`whitespace` will always be `"strict"`. | If `recursive` is true, `{"item1=[item1-subitem1=item1-subitem1-value&item1-subitem2=(item1-subitem2-subitem2A=item1-subitem2-subitem2A-value&item1-subitem2-subitem2B=item1-subitem2-subitem2B-value)]&item2=item2-value"}` will parse into `{"item1": {"item1-subitem1": "item1-subitem1-value", "item1-subitem2" {"item1-subitem2-subitem2A": "item1-subitem2-subitem2A-value", "item1-subitem2-subitem2B": "item1-subitem2-subitem2B-value"}}}`. | +| overwrite_if_destination_exists | Specifies whether to overwrite existing fields if there are key conflicts when writing parsed fields to the event. Default is `true`. | If `overwrite_if_destination_exists` is `true` and destination is `null`, `{"key1": "old_value", "message": "key1=new_value"}` will parse into `{"key1": "new_value", "message": "key1=new_value"}`. | +| tags_on_failure | When a `kv` operation causes a runtime exception within the processor, the operation is safely stopped without crashing the processor, and the event is tagged with the provided tags. | If `tags_on_failure` is set to `["keyvalueprocessor_failure"]`, `{"tags": ["keyvalueprocessor_failure"]}` will be added to the event's metadata in the event of a runtime exception. | + + + + \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/list-to-map.md b/_data-prepper/pipelines/configuration/processors/list-to-map.md new file mode 100644 index 00000000..4b137f5c --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/list-to-map.md @@ -0,0 +1,305 @@ +--- +layout: default +title: list_to_map +parent: Processors +grand_parent: Pipelines +nav_order: 58 +--- + +# list_to_map + +The `list_to_map` processor converts a list of objects from an event, where each object contains a `key` field, into a map of target keys. + +## Configuration + +The following table describes the configuration options used to generate target keys for the mappings. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`key` | Yes | String | The key of the fields to be extracted as keys in the generated mappings. +`source` | Yes | String | The list of objects with `key` fields to be converted into keys for the generated map. +`target` | No | String | The target for the generated map. When not specified, the generated map will be placed in the root node. +`value_key` | No | String | When specified, values given a `value_key` in objects contained in the source list will be extracted and converted into the value specified by this option based on the generated map. When not specified, objects contained in the source list retain their original value when mapped. +`flatten` | No | Boolean | When `true`, values in the generated map output flatten into single items based on the `flattened_element`. Otherwise, objects mapped to values from the generated map appear as lists. +`flattened_element` | Conditionally | String | The element to keep, either `first` or `last`, when `flatten` is set to `true`. + +## Usage + +The following example shows how to test the usage of the `list_to_map` processor before using the processor on your own source. + +Create a source file named `logs_json.log`. 
Because the `file` source reads each line in the `.log` file as an event, the object list appears as one line even though it contains multiple objects: + +```json +{"mylist":[{"name":"a","value":"val-a"},{"name":"b","value":"val-b1"},{"name":"b", "value":"val-b2"},{"name":"c","value":"val-c"}]} +``` +{% include copy.html %} + +Next, create a `pipeline.yaml` file that uses the `logs_json.log` file as the `source` by pointing to the `.log` file's correct path: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - list_to_map: + key: "name" + source: "mylist" + value_key: "value" + flatten: true + sink: + - stdout: +``` +{% include copy.html %} + +Run the pipeline. If successful, the processor returns the generated map with objects mapped according to their `value_key`. Similar to the original source, which contains one line and therefore one event, the processor returns the following JSON as one line. For readability, the following example and all subsequent JSON examples have been adjusted to span multiple lines: + +```json +{ + "mylist": [ + { + "name": "a", + "value": "val-a" + }, + { + "name": "b", + "value": "val-b1" + }, + { + "name": "b", + "value": "val-b2" + }, + { + "name": "c", + "value": "val-c" + } + ], + "a": "val-a", + "b": "val-b1", + "c": "val-c" +} +``` + +### Example: Maps set to `target` + +The following example `pipeline.yaml` file shows the `list_to_map` processor when set to a specified target, `mymap`: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - list_to_map: + key: "name" + source: "mylist" + target: "mymap" + value_key: "value" + flatten: true + sink: + - stdout: +``` +{% include copy.html %} + +The generated map appears under the target key: + +```json +{ + "mylist": [ + { + "name": "a", + "value": "val-a" + }, + { + "name": "b", + "value": "val-b1" + }, + { + "name": "b", + "value": "val-b2" + }, + { + "name": "c", + "value": "val-c" + } + ], + "mymap": { + "a": "val-a", + "b": "val-b1", + "c": "val-c" + } +} +``` + +### Example: No `value_key` specified + +The follow example `pipeline.yaml` file shows the `list_to_map` processor with no `value_key` specified. Because `key` is set to `name`, the processor extracts the object names to use as keys in the map. 
+ +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - list_to_map: + key: "name" + source: "mylist" + flatten: true + sink: + - stdout: +``` +{% include copy.html %} + +The values from the generated map appear as original objects from the `.log` source, as shown in the following example response: + +```json +{ + "mylist": [ + { + "name": "a", + "value": "val-a" + }, + { + "name": "b", + "value": "val-b1" + }, + { + "name": "b", + "value": "val-b2" + }, + { + "name": "c", + "value": "val-c" + } + ], + "a": { + "name": "a", + "value": "val-a" + }, + "b": { + "name": "b", + "value": "val-b1" + }, + "c": { + "name": "c", + "value": "val-c" + } +} +``` + +### Example: `flattened_element` set to `last` + +The following example `pipeline.yaml` file sets the `flattened_element` to last, therefore flattening the processor output based on each value's last element: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - list_to_map: + key: "name" + source: "mylist" + target: "mymap" + value_key: "value" + flatten: true + flattened_element: "last" + sink: + - stdout: +``` +{% include copy.html %} + +The processor maps object `b` to value `val-b2` because `val-b2` is the last element in object `b`, as shown in the following output: + +```json +{ + "mylist": [ + { + "name": "a", + "value": "val-a" + }, + { + "name": "b", + "value": "val-b1" + }, + { + "name": "b", + "value": "val-b2" + }, + { + "name": "c", + "value": "val-c" + } + ], + "a": "val-a", + "b": "val-b2", + "c": "val-c" +} +``` + + +### Example: `flatten` set to false + +The following example `pipeline.yaml` file sets `flatten` to `false`, causing the processor to output values from the generated map as a list: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - list_to_map: + key: "name" + source: "mylist" + target: "mymap" + value_key: "value" + flatten: false + sink: + - stdout: +``` +{% include copy.html %} + +Some objects in the response may have more than one element in their values, as shown in the following response: + +```json +{ + "mylist": [ + { + "name": "a", + "value": "val-a" + }, + { + "name": "b", + "value": "val-b1" + }, + { + "name": "b", + "value": "val-b2" + }, + { + "name": "c", + "value": "val-c" + } + ], + "a": [ + "val-a" + ], + "b": [ + "val-b1", + "val-b2" + ], + "c": [ + "val-c" + ] +} +``` \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/lowercase-string.md b/_data-prepper/pipelines/configuration/processors/lowercase-string.md new file mode 100644 index 00000000..34ca0597 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/lowercase-string.md @@ -0,0 +1,24 @@ +--- +layout: default +title: lowercase_string +parent: Processors +grand_parent: Pipelines +nav_order: 60 +--- + +# lowercase_string + + +The `lowercase_string` processor converts a string to its lowercase counterpart and is a [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processor. The following table describes options for configuring the `lowercase_string` processor to convert strings to a lowercase format. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +with_keys | Yes | List | A list of keys to convert to lowercase. 
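
## Usage

The following minimal example shows `with_keys` in a pipeline. It mirrors the `lowercase_string` example in the mutate string processors documentation; the file path and the `lowercaseField` key are placeholders that you replace with your own values:

```yaml
pipeline:
  source:
    file:
      path: "/full/path/to/logs_json.log"
      record_type: "event"
      format: "json"
  processor:
    - lowercase_string:
        with_keys:
          - "lowercaseField"
  sink:
    - stdout:
```
{% include copy.html %}

With this configuration, an input event such as `{"lowercaseField": "TESTmeSSage"}` is written to the sink as `{"lowercaseField": "testmessage"}`.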
+ + diff --git a/_data-prepper/pipelines/configuration/processors/mutate-event.md b/_data-prepper/pipelines/configuration/processors/mutate-event.md new file mode 100644 index 00000000..032bc89f --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/mutate-event.md @@ -0,0 +1,21 @@ +--- +layout: default +title: Mutate event +parent: Processors +grand_parent: Pipelines +nav_order: 65 +--- + +# Mutate event processors + +Mutate event processors allow you to modify events in Data Prepper. The following processors are available: + +* [add_entries]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/add-entries/) allows you to add entries to an event. +* [copy_values]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/copy-values/) allows you to copy values within an event. +* [delete_entries]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/delete-entries/) allows you to delete entries from an event. +* [rename_keys]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/rename-keys/) allows you to rename keys in an event. +* [convert_entry_type]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/convert_entry_type/) allows you to convert value types in an event. +* [list_to_map]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/list-to-map) allows you to convert list of objects from an event where each object contains a `key` field into a map of target keys. + + + diff --git a/_data-prepper/pipelines/configuration/processors/mutate-string.md b/_data-prepper/pipelines/configuration/processors/mutate-string.md new file mode 100644 index 00000000..48f64236 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/mutate-string.md @@ -0,0 +1,256 @@ +--- +layout: default +title: Mutate string +parent: Processors +grand_parent: Pipelines +nav_order: 70 +--- + +# Mutate string processors + +You can change the way that a string appears by using a mutate string processesor. For example, you can use the `uppercase_string` processor to convert a string to uppercase, and you can use the `lowercase_string` processor to convert a string to lowercase. The following is a list of processors that allow you to mutate a string: + +* [substitute_string](#substitute_string) +* [split_string](#split_string) +* [uppercase_string](#uppercase_string) +* [lowercase_string](#lowercase_string) +* [trim_string](#trim_string) + +## substitute_string + +The `substitute_string` processor matches a key's value against a regular expression (regex) and replaces all returned matches with a replacement string. + +### Configuration + +You can configure the `substitute_string` processor with the following options. + +Option | Required | Description +:--- | :--- | :--- +`entries` | Yes | A list of entries to add to an event. | +`source` | Yes | The key to be modified. | +`from` | Yes | The regex string to be replaced. Special regex characters such as `[` and `]` must be escaped using `\\` when using double quotes and `\` when using single quotes. For more information, see [Class Pattern](https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/util/regex/Pattern.html) in the Java documentation. | +`to` | Yes | The string that replaces each match of `from`. 
| + +### Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - substitute_string: + entries: + - source: "message" + from: ":" + to: "-" + sink: + - stdout: +``` +{% include copy.html %} + +Next, create a log file named `logs_json.log`. After that, replace the `path` of the file source in your `pipeline.yaml` file with your file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). + +Before you run Data Prepper, the source appears in the following format: + +```json +{"message": "ab:cd:ab:cd"} +``` + +After you run Data Prepper, the source is converted to the following format: + +```json +{"message": "ab-cd-ab-cd"} +``` + +`from` defines which string is replaced, and `to` defines the string that replaces the `from` string. In the preceding example, string `ab:cd:ab:cd` becomes `ab-cd-ab-cd`. If the `from` regex string does not return a match, the key is returned without any changes. + +## split_string + +The `split_string` processor splits a field into an array using a delimiter character. + +### Configuration + +You can configure the `split_string` processor with the following options. + +Option | Required | Description +:--- | :--- | :--- + `entries` | Yes | A list of entries to add to an event. | + `source` | Yes | The key to be split. | + `delimiter` | No | The separator character responsible for the split. Cannot be defined at the same time as `delimiter_regex`. At least `delimiter` or `delimiter_regex` must be defined. | +`delimiter_regex` | No | A regex string responsible for the split. Cannot be defined at the same time as `delimiter`. Either `delimiter` or `delimiter_regex` must be defined. | + +### Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - split_string: + entries: + - source: "message" + delimiter: "," + sink: + - stdout: +``` +{% include copy.html %} + +Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with your file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). + +Before you run Data Prepper, the source appears in the following format: + +```json +{"message": "hello,world"} +``` +After you run Data Prepper, the source is converted to the following format: + +```json +{"message":["hello","world"]} +``` + +## uppercase_string + +The `uppercase_string` processor converts the value (a string) of a key from its current case to uppercase. + +### Configuration + +You can configure the `uppercase_string` processor with the following options. + +Option | Required | Description +:--- | :--- | :--- + `with_keys` | Yes | A list of keys to convert to uppercase. | + +### Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - uppercase_string: + with_keys: + - "uppercaseField" + sink: + - stdout: +``` +{% include copy.html %} + +Next, create a log file named `logs_json.log`. 
After that, replace the `path` in the file source of your `pipeline.yaml` file with the correct file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). + +Before you run Data Prepper, the source appears in the following format: + +```json +{"uppercaseField": "hello"} +``` +After you run Data Prepper, the source is converted to the following format: + +```json +{"uppercaseField": "HELLO"} +``` + +## lowercase_string + +The `lowercase string` processor converts a string to lowercase. + +### Configuration + +You can configure the `lowercase string` processor with the following options. + +Option | Required | Description +:--- | :--- | :--- + `with_keys` | Yes | A list of keys to convert to lowercase. | + +### Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - lowercase_string: + with_keys: + - "lowercaseField" + sink: + - stdout: +``` +{% include copy.html %} + +Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with the correct file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). + +Before you run Data Prepper, the source appears in the following format: + +```json +{"lowercaseField": "TESTmeSSage"} +``` + +After you run Data Prepper, the source is converted to the following format: + +```json +{"lowercaseField": "testmessage"} +``` + +## trim_string + +The `trim_string` processor removes white space from the beginning and end of a key. + +### Configuration + +You can configure the `trim_string` processor with the following options. + +Option | Required | Description +:--- | :--- | :--- + `with_keys` | Yes | A list of keys from which to trim the white space. | + +### Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - trim_string: + with_keys: + - "trimField" + sink: + - stdout: +``` +{% include copy.html %} + +Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with the correct file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). + +Before you run Data Prepper, the source appears in the following format: + +```json +{"trimField": " Space Ship "} +``` + +After you run Data Prepper, the source is converted to the following format: + +```json +{"trimField": "Space Ship"} +``` diff --git a/_data-prepper/pipelines/configuration/processors/obfuscate.md b/_data-prepper/pipelines/configuration/processors/obfuscate.md new file mode 100644 index 00000000..4c33d8ba --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/obfuscate.md @@ -0,0 +1,95 @@ +--- +layout: default +title: obfuscate +parent: Processors +grand_parent: Pipelines +nav_order: 71 +--- + +# obfuscate + +The `obfuscate` process enables obfuscation of fields inside your documents in order to protect sensitive data. 
+ +## Usage + +In this example, a document contains a `log` field and a `phone` field, as shown in the following object: + +```json +{ + "id": 1, + "phone": "(555) 555 5555", + "log": "My name is Bob and my email address is abc@example.com" +} +``` + + +To obfuscate the `log` and `phone` fields, add the `obfuscate` processor and call each field in the `source` option. To account for both the `log` and `phone` fields, the following example uses multiple `obfuscate` processors because each processor can only obfuscate one source. + +In the first `obfuscate` processor in the pipeline, the source `log` uses several configuration options to mask the data in the log field, as shown in the following example. For more details on these options, see [configuration](#configuration). + +```yaml +pipeline: + source: + http: + processor: + - obfuscate: + source: "log" + target: "new_log" + patterns: + - "[A-Za-z0-9+_.-]+@([\\w-]+\\.)+[\\w-]{2,4}" + action: + mask: + mask_character: "#" + mask_character_length: 6 + - obfuscate: + source: "phone" + sink: + - stdout: +``` + +When run, the `obfuscate` processor parses the fields into the following output: + +```json +{ + "id": 1, + "phone": "***", + "log": "My name is Bob and my email address is abc@example.com", + "newLog": "My name is Bob and my email address is ######" +} +``` + +## Configuration + +Use the following configuration options with the `obfuscate` processor. + +| Parameter | Required | Description | +| :--- | :--- | :--- | +| `source` | Yes | The source field to obfuscate. | +| `target` | No | The new field in which to store the obfuscated value. This leaves the original source field unchanged. When no `target` is provided, the source field updates with the obfuscated value. | +| `patterns` | No | A list of regex patterns that allow you to obfuscate specific parts of a field. Only parts that match the regex pattern will obfuscate. When not provided, the processor obfuscates the whole field. | +| `action` | No | The obfuscation action. As of Data Prepper 2.3, only the `mask` action is supported. | + +You can customize the `mask` action with the following optional configuration options. + +| Parameter | Default | Description | +| :--- | :--- | :--- | +`mask_character` | `*` | The character to use when masking. Valid characters are !, #, $, %, &, *, and @. | +`mask_character_length` | `3` | The number of characters to mask in the field. The value must be between 1 and 10. | + + +## Predefined patterns + +When using the `patterns` configuration option, you can use a set of predefined obfuscation patterns for common fields. The `obfuscate` processor supports the following predefined patterns. + +You cannot use multiple patterns for one obfuscate processor. Use one pattern for each obfuscate processor. +{: .note} + + +| Pattern name | Examples | +|-----------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| %{EMAIL_ADDRESS} | abc@test.com
123@test.com
abc123@test.com
abc_123@test.com
a-b@test.com
a.b@test.com
abc@test-test.com
abc@test.com.cn
abc@test.mail.com.org | +| %{IP_ADDRESS_V4} | 1.1.1.1
192.168.1.1
255.255.255.0 | +| %{BASE_NUMBER} | 1.1
.1
2000 | +| %{CREDIT_CARD_NUMBER} | 5555555555554444
4111111111111111
1234567890123456
1234 5678 9012 3456
1234-5678-9012-3456 | +| %{US_PHONE_NUMBER} | 1555 555 5555
5555555555
1-555-555-5555
1-(555)-555-5555
1(555) 555 5555
(555) 555 5555
+1-555-555-5555
| +| %{US_SSN_NUMBER} | 123-11-1234 diff --git a/_data-prepper/pipelines/configuration/processors/otel-metrics.md b/_data-prepper/pipelines/configuration/processors/otel-metrics.md new file mode 100644 index 00000000..08fb7281 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/otel-metrics.md @@ -0,0 +1,150 @@ +--- +layout: default +title: otel_metrics +parent: Processors +grand_parent: Pipelines +nav_order: 72 +--- + +# otel_metrics + +The `otel_metrics` processor serializes a collection of `ExportMetricsServiceRequest` records sent from the [OTel metrics source]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/otel-metrics-source/) into a collection of string records. + +## Usage + +To get started, add the following processor to your `pipeline.yaml` configuration file: + +``` yaml +processor: + - otel_metrics_raw_processor: +``` +{% include copy.html %} + +## Configuration + +You can use the following optional parameters to configure histogram buckets and their default values. A histogram displays numerical data by grouping data into buckets. You can use histogram buckets to view sets of events that are organized by the total event count and aggregate sum for all events. For more detailed information, see [OpenTelemetry Histograms](https://opentelemetry.io/docs/reference/specification/metrics/data-model/#histogram). + +| Parameter | Default value | Description | +| :--- | :--- | :--- | +| `calculate_histogram_buckets` | `True` | Whether or not to calculate histogram buckets. | +| `calculate_exponential_histogram_buckets` | `True` | Whether or not to calculate exponential histogram buckets. | +| `exponential_histogram_max_allowed_scale` | `10` | Maximum allowed scale in exponential histogram calculation. | +| `flatten_attributes` | `False` | Whether or not to flatten the `attributes` field in the JSON data. | + +### calculate_histogram_buckets + +If `calculate_histogram_buckets` is not set to `false`, then the following `JSON` file will be added to every histogram JSON. If `flatten_attributes` is set to `false`, the `JSON` string format of the metrics does not change the attributes field. If `flatten_attributes` is set to `true`, the values in the attributes field are placed in the parent `JSON` object. The default value is `true`. See the following `JSON` example: + +```json + "buckets": [ + { + "min": 0.0, + "max": 5.0, + "count": 2 + }, + { + "min": 5.0, + "max": 10.0, + "count": 5 + } + ] +``` + +You can create detailed representations of histogram buckets and their boundaries. You can control this feature by using the following parameters in your `pipeline.yaml` file: + +```yaml + processor: + - otel_metrics_raw_processor: + calculate_histogram_buckets: true + calculate_exponential_histogram_buckets: true + exponential_histogram_max_allowed_scale: 10 + flatten_attributes: false +``` +{% include copy.html %} + +Each array element describes one bucket. Each bucket contains the lower boundary, upper boundary, and its value count. This is a specific form of more detailed OpenTelemetry representation that is a part of the `JSON` output created by the `otel_metrics` processor. 
See the following `JSON` file, which is added to each `JSON` histogram by the `otel_metrics` processor: + +```json + "explicitBounds": [ + 5.0, + 10.0 + ], + "bucketCountsList": [ + 2, + 5 + ] +``` + + + +### calculate_exponential_histogram_buckets + +If `calculate_exponential_histogram_buckets` is set to `true` (the default setting), the following `JSON` values are added to each `JSON` histogram: + +```json + + "negativeBuckets": [ + { + "min": 0.0, + "max": 5.0, + "count": 2 + }, + { + "min": 5.0, + "max": 10.0, + "count": 5 + } + ], +... + "positiveBuckets": [ + { + "min": 0.0, + "max": 5.0, + "count": 2 + }, + { + "min": 5.0, + "max": 10.0, + "count": 5 + } + ], +``` + +The following `JSON` file is a more detailed form of OpenTelemetry representation that consists of negative and positive buckets, a scale parameter, an offset, and a list of bucket counts: + + +```json + "negative": [ + 1, + 2, + 3 + ], + "positive": [ + 1, + 2, + 3 + ], + "scale" : -3, + "negativeOffset" : 0, + "positiveOffset" : 1 +``` + + +### exponential_histogram_max_allowed_scale + +The `exponential_histogram_max_allowed_scale` parameter defines the maximum allowed scale for an exponential histogram. If you increase this parameter, you will increase potential memory consumption. See the [OpenTelemetry specifications](https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/metrics/v1/metrics.proto) for more information on exponential histograms and their computational complexity. + +All exponential histograms that have a scale that is above the configured parameter (by default, a value of `10`) are discarded and logged with an error level. You can check the log that Data Prepper creates to see the `ERROR` log message. + +The absolute scale value is used for comparison, so a scale of `-11` that is treated equally to `11` exceeds the configured value of `10` and can be discarded. +{: .note} + +## Metrics + +The following table describes metrics that are common to all processors. + +| Metric name | Type | Description | +| ------------- | ---- | -----------| +| `recordsIn` | Counter | Metric representing the number of ingress records. | +| `recordsOut` | Counter | Metric representing the number of egress records. | +| `timeElapsed` | Timer | Metric representing the time elapsed during execution of records. | diff --git a/_data-prepper/pipelines/configuration/processors/otel-trace-group.md b/_data-prepper/pipelines/configuration/processors/otel-trace-group.md new file mode 100644 index 00000000..06bc754a --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/otel-trace-group.md @@ -0,0 +1,79 @@ +--- +layout: default +title: otel_trace_group +parent: Processors +grand_parent: Pipelines +nav_order: 73 +--- + +# otel_trace_group + +The `otel_trace_group` processor completes missing trace-group-related fields in the collection of [span](https://github.com/opensearch-project/data-prepper/blob/834f28fdf1df6d42a6666e91e6407474b88e7ec6/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/trace/Span.java) records by looking up the OpenSearch backend. The `otel_trace_group` processor identifies the missing trace group information for a `spanId` by looking up the relevant fields in its root `span` stored in OpenSearch. + +## OpenSearch + +When you connect to an OpenSearch cluster using your username and password, use the following example `pipeline.yaml` file to configure the `otel_trace_group` processor: + +``` YAML +pipeline: + ... 
+ processor: + - otel_trace_group: + hosts: ["https://localhost:9200"] + cert: path/to/cert + username: YOUR_USERNAME_HERE + password: YOUR_PASSWORD_HERE +``` + +See [OpenSearch security]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sinks/opensearch/#opensearch-cluster-security) for a more detailed explanation of which OpenSearch credentials and permissions are required and how to configure those credentials for the OTel trace group processor. + +### Amazon OpenSearch Service + +When you use [Amazon OpenSearch Service]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sinks/opensearch/#amazon-opensearch-service-domain-security), use the following example `pipeline.yaml` file to configure the `otel_trace_group` processor: + +``` YAML +pipeline: + ... + processor: + - otel_trace_group: + hosts: ["https://your-amazon-opensearch-service-endpoint"] + aws_sigv4: true + cert: path/to/cert + insecure: false +``` + +## Configuration + +You can configure the `otel_trace_group` processor with the following options. + +| Name | Description | Default value | +| -----| ----| -----------| +| `hosts`| A list of IP addresses of OpenSearch nodes. Required. | No default value. | +| `cert` | A certificate authority (CA) certificate that is PEM encoded. Accepts both .pem or .crt. This enables the client to trust the CA that has signed the certificate that OpenSearch is using. | `null` | +| `aws_sigv4` | A Boolean flag used to sign the HTTP request with AWS credentials. Only applies to Amazon OpenSearch Service. See [OpenSearch security](https://github.com/opensearch-project/data-prepper/blob/129524227779ee35a327c27c3098d550d7256df1/data-prepper-plugins/opensearch/security.md) for details. | `false`. | +| `aws_region` | A string that represents the AWS Region of the Amazon OpenSearch Service domain, for example, `us-west-2`. Only applies to Amazon OpenSearch Service. | `us-east-1` | +| `aws_sts_role_arn`| An AWS Identity and Access Management (IAM) role that the sink plugin assumes to sign the request to Amazon OpenSearch Service. If not provided, the plugin uses the [default credentials](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/auth/credentials/DefaultCredentialsProvider.html). | `null` | +| `aws_sts_header_overrides` | A map of header overrides that the IAM role assumes for the sink plugin. | `null` | +| `insecure` | A Boolean flag used to turn off SSL certificate verification. If set to `true`, CA certificate verification is turned off and insecure HTTP requests are sent. | `false` | +| `username` | A string that contains the username and is used in the [internal users](https://opensearch.org/docs/latest/security/access-control/users-roles/) `YAML` configuration file of your OpenSearch cluster. | `null` | +| `password` | A string that contains the password and is used in the [internal users](https://opensearch.org/docs/latest/security/access-control/users-roles/) `YAML` configuration file of your OpenSearch cluster. | `null` | + +## Configuration option examples + +You can define the configuration option values in the `aws_sts_header_overrides` option. See the following example: + +``` +aws_sts_header_overrides: + x-my-custom-header-1: my-custom-value-1 + x-my-custom-header-2: my-custom-value-2 +``` + +## Metrics + +The following table describes custom metrics specific to the `otel_trace_group` processor. 
+ +| Metric name | Type | Description | +| ------------- | ---- | ----------- | +| `recordsInMissingTraceGroup` | Counter | The number of ingress records missing trace group fields. | +| `recordsOutFixedTraceGroup` | Counter | The number of egress records with successfully completed trace group fields. | +| `recordsOutMissingTraceGroup` | Counter | The number of egress records missing trace group fields. | \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/otel-trace-raw.md b/_data-prepper/pipelines/configuration/processors/otel-trace-raw.md new file mode 100644 index 00000000..395956a6 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/otel-trace-raw.md @@ -0,0 +1,44 @@ +--- +layout: default +title: otel_trace +parent: Processors +grand_parent: Pipelines +nav_order: 75 +--- + +# otel_trace + +The `otel_trace` processor completes trace-group-related fields in all incoming Data Prepper span records by state caching the root span information for each `traceId`. + +## Parameters + +This processor includes the following parameters. + +* `traceGroup`: Root span name +* `endTime`: End time of the entire trace in International Organization for Standardization (ISO) 8601 format +* `durationInNanos`: Duration of the entire trace in nanoseconds +* `statusCode`: Status code for the entire trace in nanoseconds + +## Configuration + +The following table describes the options you can use to configure the `otel_trace` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +trace_flush_interval | No | Integer | Represents the time interval in seconds to flush all the descendant spans without any root span. Default is 180. + + +## Metrics + +The following table describes common [Abstract processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/processor/AbstractProcessor.java) metrics. + +| Metric name | Type | Description | +| ------------- | ---- | -----------| +| `recordsIn` | Counter | Metric representing the ingress of records to a pipeline component. | +| `recordsOut` | Counter | Metric representing the egress of records from a pipeline component. | +| `timeElapsed` | Timer | Metric representing the time elapsed during execution of a pipeline component. | + +The `otel_trace` processor includes the following custom metrics: + +* `traceGroupCacheCount`: The number of trace groups in the trace group cache. +* `spanSetCount`: The number of span sets in the span set collection. diff --git a/_data-prepper/pipelines/configuration/processors/parse-json.md b/_data-prepper/pipelines/configuration/processors/parse-json.md new file mode 100644 index 00000000..2cbce478 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/parse-json.md @@ -0,0 +1,77 @@ +--- +layout: default +title: parse_json +parent: Processors +grand_parent: Pipelines +nav_order: 80 +--- + +# parse_json + +The `parse_json` processor parses JSON data for an event, including any nested fields. The processor extracts the JSON pointer data and adds the input event to the extracted fields. + + +## Configuration + +You can configure the `parse_json` processor with the following options. + +| Option | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| `source` | No | String | The field in the `event` that will be parsed. Default value is `message`. | +| `destination` | No | String | The destination field of the parsed JSON. Defaults to the root of the `event`. 
Cannot be `""`, `/`, or any white-space-only `string` because these are not valid `event` fields. | +| `pointer` | No | String | A JSON pointer to the field to be parsed. There is no `pointer` by default, meaning the entire `source` is parsed. The `pointer` can access JSON array indexes as well. If the JSON pointer is invalid then the entire `source` data is parsed into the outgoing `event`. If the key that is pointed to already exists in the `event` and the `destination` is the root, then the pointer uses the entire path of the key. | + +## Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +parse-json-pipeline: + source: + ... + .... + processor: + - parse_json: +``` + +### Basic example + +To test the `parse_json` processor with the previous configuration, run the pipeline and paste the following line into your console, then enter `exit` on a new line: + +``` +{"outer_key": {"inner_key": "inner_value"}} +``` +{% include copy.html %} + +The `parse_json` processor parses the message into the following format: + +``` +{"message": {"outer_key": {"inner_key": "inner_value"}}", "outer_key":{"inner_key":"inner_value"}}} +``` + +### Example with a JSON pointer + +You can use a JSON pointer to parse a selection of the JSON data by specifying the `pointer` option in the configuration. To get started, create the following `pipeline.yaml` file: + +```yaml +parse-json-pipeline: + source: + ... + .... + processor: + - parse_json: + pointer: "outer_key/inner_key" +``` + +To test the `parse_json` processor with the pointer option, run the pipeline, paste the following line into your console, and then enter `exit` on a new line: + +``` +{"outer_key": {"inner_key": "inner_value"}} +``` +{% include copy.html %} + +The processor parses the message into the following format: + +``` +{"message": {"outer_key": {"inner_key": "inner_value"}}", "inner_key": "inner_value"} +``` \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/processors.md b/_data-prepper/pipelines/configuration/processors/processors.md new file mode 100644 index 00000000..3000d716 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/processors.md @@ -0,0 +1,14 @@ +--- +layout: default +title: Processors +has_children: true +parent: Pipelines +nav_order: 25 +--- + +# Processors + +Processors perform an action on your data, such as filtering, transforming, or enriching. + +Prior to Data Prepper 1.3, processors were named preppers. Starting in Data Prepper 1.3, the term *prepper* is deprecated in favor of the term *processor*. Data Prepper will continue to support the term *prepper* until 2.0, where it will be removed. +{: .note } \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/rename-keys.md b/_data-prepper/pipelines/configuration/processors/rename-keys.md new file mode 100644 index 00000000..f57b4e50 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/rename-keys.md @@ -0,0 +1,98 @@ +--- +layout: default +title: rename_keys +parent: Processors +grand_parent: Pipelines +nav_order: 85 +--- + +# rename_keys + +The `rename_keys` processor renames keys in an event. + +## Configuration + +You can configure the `rename_keys` processor with the following options. + +| Option | Required | Description | +| :--- | :--- | :--- | +| `entries` | Yes | A list of event entries to rename. | +| `from_key` | Yes | The key of the entry to be renamed. | +| `to_key` | Yes | The new key of the entry. 
| +| `overwrite_if_to_key_exists` | No | When set to `true`, the existing value is overwritten if `key` already exists in the event. The default value is `false`. | + +## Usage + +To get started, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - rename_keys: + entries: + - from_key: "message" + to_key: "newMessage" + overwrite_if_to_key_exists: true + sink: + - stdout: +``` +{% include copy.html %} + + +Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). + +For example, before you run the `rename_keys` processor, if the `logs_json.log` file contains the following event record: + +```json +{"message": "hello"} +``` + +When you run the `rename_keys` processor, it parses the message into the following "newMessage" output: + +```json +{"newMessage": "hello"} +``` + +> If `newMessage` already exists, its existing value is overwritten with `value`. + + + +## Special considerations + +Renaming operations occur in the order that the key-value pair entries are listed in the `pipeline.yaml` file. This means that chaining (where key-value pairs are renamed in sequence) is implicit in the `rename_keys` processor. See the following example `pipeline.yaml` file: + +```yaml +pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - rename_keys: + entries: + - from_key: "message" + to_key: "message2" + - from_key: "message2" + to_key: "message3" + sink: + - stdout: +``` + +Add the following contents to the `logs_json.log` file: + +```json +{"message": "hello"} +``` +{% include copy.html %} + +After the `rename_keys` processor runs, the following output appears: + +```json +{"message3": "hello"} +``` \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/routes.md b/_data-prepper/pipelines/configuration/processors/routes.md new file mode 100644 index 00000000..eb451537 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/routes.md @@ -0,0 +1,19 @@ +--- +layout: default +title: routes +parent: Processors +grand_parent: Pipelines +nav_order: 90 +--- + +# Routes + +Routes define conditions that can be used in sinks for conditional routing. Routes are specified at the same level as processors and sinks under the name `route` and consist of a list of key-value pairs, where the key is the name of a route and the value is a Data Prepper expression representing the routing condition. + + \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/service-map-stateful.md b/_data-prepper/pipelines/configuration/processors/service-map-stateful.md new file mode 100644 index 00000000..a05f4486 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/service-map-stateful.md @@ -0,0 +1,38 @@ +--- +layout: default +title: service_map +parent: Processors +grand_parent: Pipelines +nav_order: 95 +--- + +# service_map + +The `service_map` processor uses OpenTelemetry data to create a distributed service map for visualization in OpenSearch Dashboards. + +## Configuration + +The following table describes the option you can use to configure the `service_map` processor. 
+ +Option | Required | Type | Description +:--- | :--- | :--- | :--- +window_duration | No | Integer | Represents the fixed time window, in seconds, during which service map relationships are evaluated. Default value is 180. + + + +## Metrics + +The following table describes common [Abstract processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-api/src/main/java/org/opensearch/dataprepper/model/processor/AbstractProcessor.java) metrics. + +| Metric name | Type | Description | +| ------------- | ---- | -----------| +| `recordsIn` | Counter | Metric representing the ingress of records to a pipeline component. | +| `recordsOut` | Counter | Metric representing the egress of records from a pipeline component. | +| `timeElapsed` | Timer | Metric representing the time elapsed during execution of a pipeline component. | + +The `service-map-stateful` processor includes following custom metrics: + +* `traceGroupCacheCount`: The number of trace groups in the trace group cache. +* `spanSetCount`: The number of span sets in the span set collection. \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/split-string.md b/_data-prepper/pipelines/configuration/processors/split-string.md new file mode 100644 index 00000000..3959ae5a --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/split-string.md @@ -0,0 +1,27 @@ +--- +layout: default +title: split_string +parent: Processors +grand_parent: Pipelines +nav_order: 100 +--- + +# split_string + + +The `split_string` processor splits a field into an array using a delimiting character and is a [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processor. The following table describes the options you can use to configure the `split_string` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +entries | Yes | List | List of entries. Valid values are `source`, `delimiter`, and `delimiter_regex`. +source | N/A | N/A | The key to split. +delimiter | No | N/A | The separator character responsible for the split. Cannot be defined at the same time as `delimiter_regex`. At least `delimiter` or `delimiter_regex` must be defined. +delimiter_regex | No | N/A | The regex string responsible for the split. Cannot be defined at the same time as `delimiter`. At least `delimiter` or `delimiter_regex` must be defined. + + \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/string-converter.md b/_data-prepper/pipelines/configuration/processors/string-converter.md new file mode 100644 index 00000000..32055791 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/string-converter.md @@ -0,0 +1,24 @@ +--- +layout: default +title: string_converter +parent: Processors +grand_parent: Pipelines +nav_order: 105 +--- + +# string_converter + + +The `string_converter` processor converts a string to uppercase or lowercase. You can use it as an example for developing your own processor. The following table describes the option you can use to configure the `string_converter` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +upper_case | No | Boolean | Whether to convert to uppercase (`true`) or lowercase (`false`). 
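
## Usage

The following minimal `pipeline.yaml` sketch shows the `upper_case` option in context. It assumes the `random` source, which emits random string records, and the `stdout` sink; substitute your own source and sink as needed:

```yaml
sample-pipeline:
  source:
    random:
  processor:
    - string_converter:
        upper_case: true
  sink:
    - stdout:
```
{% include copy.html %}

Setting `upper_case` to `false` converts the strings to lowercase instead.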
+ + \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/substitute-string.md b/_data-prepper/pipelines/configuration/processors/substitute-string.md new file mode 100644 index 00000000..5d18bf6a --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/substitute-string.md @@ -0,0 +1,24 @@ +--- +layout: default +title: substitute_string +parent: Processors +grand_parent: Pipelines +nav_order: 110 +--- + +# substitute_string + +The `substitute_string` processor matches a key's value against a regular expression and replaces all matches with a replacement string. `substitute_string` is a [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processor. + +## Configuration + +The following table describes the options you can use to configure the `substitute_string` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +entries | Yes | List | List of entries. Valid values are `source`, `from`, and `to`. +source | N/A | N/A | The key to modify. +from | N/A | N/A | The Regex String to be replaced. Special regex characters such as `[` and `]` must be escaped using `\\` when using double quotes and `\ ` when using single quotes. See [Java Patterns](https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/util/regex/Pattern.html) for more information. +to | N/A | N/A | The String to be substituted for each match of `from`. + + diff --git a/_data-prepper/pipelines/configuration/processors/trace-peer-forwarder.md b/_data-prepper/pipelines/configuration/processors/trace-peer-forwarder.md new file mode 100644 index 00000000..a73295b8 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/trace-peer-forwarder.md @@ -0,0 +1,52 @@ +--- +layout: default +title: trace_peer_forwarder +parent: Processors +grand_parent: Pipelines +nav_order: 115 +--- + +# trace peer forwarder + +The `trace_peer_forwarder` processor is used with [peer forwarder]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/peer-forwarder/) to reduce by half the number of events forwarded in a [Trace Analytics]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/trace-analytics/) pipeline. In Trace Analytics, each event is typically duplicated when it is sent from `otel-trace-pipeline` to `raw-pipeline` and `service-map-pipeline`. When pipelines forward events, this causes the core peer forwarder to send multiple HTTP requests for the same event. You can use `trace peer forwarder` to forward an event once through the `otel-trace-pipeline` instead of `raw-pipeline` and `service-map-pipeline`, which prevents unnecessary HTTP requests. + +You should use `trace_peer_forwarder` for Trace Analytics pipelines when you have multiple nodes. + +## Usage + +To get started with `trace_peer_forwarder`, first configure [peer forwarder]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/peer-forwarder/). Then create a `pipeline.yaml` file and specify `trace peer forwarder` as the processor. You can configure `peer forwarder` in your `data-prepper-config.yaml` file. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). 
+ +See the following example `pipeline.yaml` file: + +```yaml +otel-trace-pipeline: + delay: "100" + source: + otel_trace_source: + processor: + - trace_peer_forwarder: + sink: + - pipeline: + name: "raw-pipeline" + - pipeline: + name: "service-map-pipeline" +raw-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - otel_trace_raw: + sink: + - opensearch: +service-map-pipeline: + delay: "100" + source: + pipeline: + name: "entry-pipeline" + processor: + - service_map_stateful: + sink: + - opensearch: +``` + +In the preceding `pipeline.yaml` file, events are forwarded in the `otel-trace-pipeline` to the target peer, and no forwarding is performed in `raw-pipeline` or `service-map-pipeline`. This process helps improve network performance by forwarding events (as HTTP requests) once instead of twice. \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/translate.md b/_data-prepper/pipelines/configuration/processors/translate.md new file mode 100644 index 00000000..d29aa589 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/translate.md @@ -0,0 +1,176 @@ +--- +layout: default +title: translate +parent: Processors +grand_parent: Pipelines +nav_order: 117 +--- + +# translate + +The `translate` processor transforms values in events into preconfigured values. + +## Basic usage + +To use the `translate` processor, create the following `pipeline.yaml` file: + +```yaml +translate-pipeline: + source: + file: + path: "/full/path/to/logs_json.log" + record_type: "event" + format: "json" + processor: + - translate: + mappings: + - source: "status" + targets: + - target: "translated_result" + map: + 404: "Not Found" + sink: + - stdout: +``` + +Then create the following file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with the path of a file containing the following JSON data: + +```json +{ "status": "404" } +``` + +The `translate` processor configuration in `pipeline.yaml` retrieves the `source` value from the event data and compares it against the keys specified under the `targets`. +When a match is found, the processor places the corresponding mapped value into the `target` key provided in the configuration. + +When you run Data Prepper with the previous `pipeline.yaml` file, you should receive the following output: + +```json +{ + "status": "404", + "translated_result": "Not Found" +} +``` + +## Advanced options + +The following example shows a more involved mapping with additional configurations for the `translate` processor: + +```yaml +processor: + - translate: + mappings: + - source: "status" + targets: + - target: "translated_result" + map: + 404: "Not Found" + default: "default" + type: "string" + translate_when: "/response != null" + - target: "another_translated_result" + regex: + exact: false + patterns: + "2[0-9]{2}" : "Success" # Matches ranges from 200-299 + "5[0-9]{2}": "Error" # Matches ranges form 500-599 + file: + name: "path/to/file.yaml" + aws: + bucket: my_bucket + region: us-east-1 + sts_role_arn: arn:aws:iam::123456789012:role/MyS3Role +``` + +On the top level, specify `mappings` for inline mapping configurations, or `file` pull mapping configurations from a file. Both `mappings` and `file` options can be specified together, and the processor considers the mappings from both sources for translations. 
In instances where the pipeline configuration and file mappings share duplicate `source` and `target` pairs, the mappings specified within the pipeline configuration take precedence. + + +## Configuration + +You can use the following options to configure the `translate` processor. + +| Parameter | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| mappings | No | List | Defines inline mappings. For more information, see [mappings](#mappings). | +| file | No | Map | Points to the file that contains mapping configurations. For more information, see [file](#file). | + +### mappings + +Each item in the `mappings` configuration contains the following options. + +| Parameter | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| source | Yes | String or list | The source field to translate. Can be a string or a list of strings. | +| targets | Yes | List | A list of target field configurations, such as the target field key or translation maps. | + +Each item in the `targets` configuration contains the following options. + +| Parameter | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| target | Yes | String | The key that specifies the field in the output in which the translated value will be placed. | +| map | No | Map | A list of key-value pairs that define the translations. Each key represents a possible value in the source field, and the corresponding value represents what it should be translated to. For examples, see [map option](#map-option). At least one of `map` and `regex` should be configured. | +| regex | No | Map | A map of keys that defines the translation map. For more options, see [regex option](#regex-option). At least one of `map` and `regex` should be configured. | +| default | No | String | The default value to use when no match is found during translation. | +| type | No | String | Specifies the data type for the target value. | +| translate_when | No | String | Uses a [Data Prepper expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/) to specify a condition for performing the translation. When specified, the expression will only translate when the condition is met. | + +#### map option + +You can use the following key types when using the map option: + +* Individual keys + ```yaml + map: + ok : "Success" + 120: "Found" + ``` +* Number ranges + ```yaml + map: + "100-200": "Success" + "400-499": "Error" + ``` +* Comma-delimited keys + ```yaml + map: + "key1,key2,key3": "value1" + "100-200,key4": "value2" + ``` + +When configuring the keys inside the `map` option, do not use any overlapping number ranges or duplicate keys. + +#### regex option + +You can use the following options with the `regex` option. + +| Parameter | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| patterns | Yes | Map | A map of key-value pairs defining the regex patterns of keys and the value to translate to for each pattern. | +| exact | No | Boolean | Whether to use full string match or partial string match on the regex pattern. If `true`, the pattern is considered a match only when the entire key matches the pattern. Otherwise, the pattern is considered a match when a sub-string of the key matches the pattern. | + +### file + +The `file` option in the `translate` processor takes a local YAML file or an Amazon Simple Storage Service (Amazon S3) object containing translation mappings. 
The file's contents should be in the following format: +```yaml +mappings: + - source: "status" + targets: + - target: "result" + map: + "foo": "bar" + # Other configurations +``` + +You can use the following options in the `file` configuration. + +| Parameter | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| name | Yes | String | The full path to a local file or key name for an S3 object. | +| aws | No | Map | The AWS configuration when the file is an S3 object. See the following table for more information. | + +You can use the following options with the `aws` configuration. + +| Parameter | Required | Type | Description | +| :--- | :--- | :--- | :--- | +| `bucket` | Yes | String | The Amazon S3 bucket name. | +| `region` | Yes | String | The AWS Region to use for credentials. | +| `sts_role_arn` | Yes | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon S3. | diff --git a/_data-prepper/pipelines/configuration/processors/trim-string.md b/_data-prepper/pipelines/configuration/processors/trim-string.md new file mode 100644 index 00000000..46b6ad4a --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/trim-string.md @@ -0,0 +1,23 @@ +--- +layout: default +title: trim_string +parent: Processors +grand_parent: Pipelines +nav_order: 120 +--- + +# trim_string + +The `trim_string` processor removes white space from the beginning and end of a key and is a [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processor. The following table describes the option you can use to configure the `trim_string` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +with_keys | Yes | List | A list of keys to trim the white space from. + + \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/uppercase-string.md b/_data-prepper/pipelines/configuration/processors/uppercase-string.md new file mode 100644 index 00000000..88e6e901 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/uppercase-string.md @@ -0,0 +1,23 @@ +--- +layout: default +title: uppercase_string +parent: Processors +grand_parent: Pipelines +nav_order: 125 +--- + +# uppercase_string + +The `uppercase_string` processor converts an entire string to uppercase and is a [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processor. The following table describes the option you can use to configure the `uppercase_string` processor. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +with_keys | Yes | List | A list of keys to convert to uppercase. + + \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/user-agent.md b/_data-prepper/pipelines/configuration/processors/user-agent.md new file mode 100644 index 00000000..8d2592a5 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/user-agent.md @@ -0,0 +1,63 @@ +--- +layout: default +title: user_agent +parent: Processors +grand_parent: Pipelines +nav_order: 130 +--- + +# user_agent + +The `user_agent` processor parses any user agent (UA) string in an event and then adds the parsing results to the event's write data. 
+ +## Usage + +In this example, the `user_agent` processor calls the source that contains the UA string, the `ua` field, and indicates the key to which the parsed string will write, `user_agent`, as shown in the following example: + +```yaml + processor: + - user_agent: + source: "ua" + target: "user_agent" +``` + +The following example event contains the `ua` field with a string that provides information about a user: + +```json +{ + "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Mobile/15E148 Safari/604.1" +} +``` + +The `user_agent` processor parses the string into a format compatible with Elastic Common Schema (ECS) and then adds the result to the specified target, as shown in the following example: + +```json +{ + "user_agent": { + "original": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Mobile/15E148 Safari/604.1", + "os": { + "version": "13.5.1", + "full": "iOS 13.5.1", + "name": "iOS" + }, + "name": "Mobile Safari", + "version": "13.1.1", + "device": { + "name": "iPhone" + } + }, + "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Mobile/15E148 Safari/604.1" +} +``` + +## Configuration options + +You can use the following configuration options with the `user_agent` processor. + +| Option | Required | Description | +| :--- | :--- | :--- | +| `source` | Yes | The field in the event that will be parsed. +| `target` | No | The field to which the parsed event will write. Default is `user_agent`. +| `exclude_original` | No | Determines whether to exclude the original UA string from the parsing result. Defaults to `false`. +| `cache_size` | No | The cache size of the parser in megabytes. Defaults to `1000`. | +| `tags_on_parse_failure` | No | The tag to add to an event if the `user_agent` processor fails to parse the UA string. | diff --git a/_data-prepper/pipelines/configuration/sinks/file.md b/_data-prepper/pipelines/configuration/sinks/file.md new file mode 100644 index 00000000..74af5a18 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sinks/file.md @@ -0,0 +1,31 @@ +--- +layout: default +title: file +parent: Sinks +grand_parent: Pipelines +nav_order: 45 +--- + +# file + +Use the `file` sink to create a flat file output, usually a `.log` file. + +## Configuration options + +The following table describes options you can configure for the `file` sink. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +path | Yes | String | Path for the output file (e.g. `logs/my-transformed-log.log`). + +## Usage + +The following example shows basic usage of the `file` sink: + +``` +sample-pipeline: + sink: + - file: + path: path/to/output-file +``` + diff --git a/_data-prepper/pipelines/configuration/sinks/opensearch.md b/_data-prepper/pipelines/configuration/sinks/opensearch.md new file mode 100644 index 00000000..b4861f68 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sinks/opensearch.md @@ -0,0 +1,304 @@ +--- +layout: default +title: opensearch +parent: Sinks +grand_parent: Pipelines +nav_order: 50 +--- + +# opensearch + +You can use the `opensearch` sink plugin to send data to an OpenSearch cluster, a legacy Elasticsearch cluster, or an Amazon OpenSearch Service domain. + +The plugin supports OpenSearch 1.0 and later and Elasticsearch 7.3 and later. 
+ +## Usage + +To configure an `opensearch` sink, specify the `opensearch` option within the pipeline configuration: + +```yaml +pipeline: + ... + sink: + opensearch: + hosts: ["https://localhost:9200"] + cert: path/to/cert + username: YOUR_USERNAME + password: YOUR_PASSWORD + index_type: trace-analytics-raw + dlq_file: /your/local/dlq-file + max_retries: 20 + bulk_size: 4 +``` + +To configure an [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/what-is.html) sink, specify the domain endpoint as the `hosts` option, as shown in the following example: + +```yaml +pipeline: + ... + sink: + opensearch: + hosts: ["https://your-amazon-opensearch-service-endpoint"] + aws_sigv4: true + cert: path/to/cert + insecure: false + index_type: trace-analytics-service-map + bulk_size: 4 +``` + +## Configuration options + +The following table describes options you can configure for the `opensearch` sink. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +hosts | Yes | List | List of OpenSearch hosts to write to (for example, `["https://localhost:9200", "https://remote-cluster:9200"]`). +cert | No | String | Path to the security certificate (for example, `"config/root-ca.pem"`) if the cluster uses the OpenSearch Security plugin. +username | No | String | Username for HTTP basic authentication. +password | No | String | Password for HTTP basic authentication. +aws_sigv4 | No | Boolean | Default value is false. Whether to use AWS Identity and Access Management (IAM) signing to connect to an Amazon OpenSearch Service domain. For your access key, secret key, and optional session token, Data Prepper uses the default credential chain (environment variables, Java system properties, `~/.aws/credential`, etc.). +aws_region | No | String | The AWS region (for example, `"us-east-1"`) for the domain if you are connecting to Amazon OpenSearch Service. +aws_sts_role_arn | No | String | IAM role that the plugin uses to sign requests sent to Amazon OpenSearch Service. If this information is not provided, the plugin uses the default credentials. +[max_retries](#configure-max_retries) | No | Integer | The maximum number of times the OpenSearch sink should try to push data to the OpenSearch server before considering it to be a failure. Defaults to `Integer.MAX_VALUE`. If not provided, the sink will try to push data to the OpenSearch server indefinitely because the default value is high and exponential backoff would increase the waiting time before retry. +socket_timeout | No | Integer | The timeout, in milliseconds, waiting for data to return (or the maximum period of inactivity between two consecutive data packets). A timeout value of zero is interpreted as an infinite timeout. If this timeout value is negative or not set, the underlying Apache HttpClient would rely on operating system settings for managing socket timeouts. +connect_timeout | No | Integer | The timeout in milliseconds used when requesting a connection from the connection manager. A timeout value of zero is interpreted as an infinite timeout. If this timeout value is negative or not set, the underlying Apache HttpClient would rely on operating system settings for managing connection timeouts. +insecure | No | Boolean | Whether or not to verify SSL certificates. If set to true, certificate authority (CA) certificate verification is disabled and insecure HTTP requests are sent instead. Default value is `false`. 
+proxy | No | String | The address of a [forward HTTP proxy server](https://en.wikipedia.org/wiki/Proxy_server). The format is "<host name or IP>:<port>". Examples: "example.com:8100", "http://example.com:8100", "112.112.112.112:8100". Port number cannot be omitted. +index | Conditionally | String | Name of the export index. Applicable and required only when the `index_type` is `custom`. +index_type | No | String | This index type tells the Sink plugin what type of data it is handling. Valid values: `custom`, `trace-analytics-raw`, `trace-analytics-service-map`, `management-disabled`. Default value is `custom`. +template_type | No | String | Defines what type of OpenSearch template to use. The available options are `v1` and `index-template`. The default value is `v1`, which uses the original OpenSearch templates available at the `_template` API endpoints. The `index-template` option uses composable [index templates]({{site.url}}{{site.baseurl}}/opensearch/index-templates/) which are available through OpenSearch's `_index_template` API. Composable index types offer more flexibility than the default and are necessary when an OpenSearch cluster has already existing index templates. Composable templates are available for all versions of OpenSearch and some later versions of Elasticsearch. When `distribution_version` is set to `es6`, Data Prepper enforces the `template_type` as `v1`. +template_file | No | String | The path to a JSON [index template]({{site.url}}{{site.baseurl}}/opensearch/index-templates/) file such as `/your/local/template-file.json` when `index_type` is set to `custom`. For an example template file, see [otel-v1-apm-span-index-template.json](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/opensearch/src/main/resources/otel-v1-apm-span-index-template.json). If you supply a template file it must match the template format specified by the `template_type` parameter. +document_id_field | No | String | The field from the source data to use for the OpenSearch document ID (for example, `"my-field"`) if `index_type` is `custom`. +dlq_file | No | String | The path to your preferred dead letter queue file (for example, `/your/local/dlq-file`). Data Prepper writes to this file when it fails to index a document on the OpenSearch cluster. +dlq | No | N/A | DLQ configurations. See [Dead Letter Queues]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/dlq/) for details. If the `dlq_file` option is also available, the sink will fail. +bulk_size | No | Integer (long) | The maximum size (in MiB) of bulk requests sent to the OpenSearch cluster. Values below 0 indicate an unlimited size. If a single document exceeds the maximum bulk request size, Data Prepper sends it individually. Default value is 5. +ism_policy_file | No | String | The absolute file path for an ISM (Index State Management) policy JSON file. This policy file is effective only when there is no built-in policy file for the index type. For example, `custom` index type is currently the only one without a built-in policy file, thus it would use the policy file here if it's provided through this parameter. For more information, see [ISM policies]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/). +number_of_shards | No | Integer | The number of primary shards that an index should have on the destination OpenSearch server. This parameter is effective only when `template_file` is either explicitly provided in Sink configuration or built-in. 
If this parameter is set, it would override the value in index template file. For more information, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). +number_of_replicas | No | Integer | The number of replica shards each primary shard should have on the destination OpenSearch server. For example, if you have 4 primary shards and set number_of_replicas to 3, the index has 12 replica shards. This parameter is effective only when `template_file` is either explicitly provided in Sink configuration or built-in. If this parameter is set, it would override the value in index template file. For more information, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). +distribution_version | No | String | Indicates whether the sink backend version is Elasticsearch 6 or later. `es6` represents Elasticsearch 6. `default` represents the latest compatible backend version, such as Elasticsearch 7.x, OpenSearch 1.x, or OpenSearch 2.x. Default is `default`. +enable_request_compression | No | Boolean | Whether to enable compression when sending requests to OpenSearch. When `distribution_version` is set to `es6`, default is `false`. For all other distribution versions, default is `true`. +serverless | No | Boolean | Determines whether the OpenSearch backend is Amazon OpenSearch Serverless. Set this value to `true` when the destination for the `opensearch` sink is an Amazon OpenSearch Serverless collection. Default is `false`. +serverless_options | No | Object | The network configuration options available when the backend of the `opensearch` sink is set to Amazon OpenSearch Serverless. For more information, see [Serverless options](#serverless-options). + +### Serverless options + +The following options can be used in the `serverless_options` object. + +Option | Required | Type | Description +:--- | :--- | :---| :--- +network_policy_name | Yes | String | The name of the network policy to create. +collection_name | Yes | String | The name of the Amazon OpenSearch Serverless collection to configure. +vpce_id | Yes | String | The virtual private cloud (VPC) endpoint to which the source connects. + +### Configure max_retries + +You can include the `max_retries` option in your pipeline configuration to control the number of times the source tries to write to sinks with exponential backoff. If you don't include this option, pipelines keep retrying forever. + +If you specify `max_retries` and a pipeline has a [dead-letter queue (DLQ)]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/dlq/) configured, the pipeline will keep trying to write to sinks until it reaches the maximum number of retries, at which point it starts to send failed data to the DLQ. + +If you don't specify `max_retries`, only data that is rejected by sinks is written to the DLQ. Pipelines continue to try to write all other data to the sinks. + +## OpenSearch cluster security + +In order to send data to an OpenSearch cluster using the `opensearch` sink plugin, you must specify your username and password within the pipeline configuration. The following example `pipelines.yaml` file demonstrates how to specify admin security credentials: + +```yaml +sink: + - opensearch: + username: "admin" + password: "admin" + ... +``` + +Alternately, rather than admin credentials, you can specify the credentials of a user mapped to a role with the minimum permissions listed in the following sections. 
+ +### Cluster permissions + +- `cluster_all` +- `indices:admin/template/get` +- `indices:admin/template/put` + +### Index permissions + +- Index: `otel-v1*`; Index permission: `indices_all` +- Index: `.opendistro-ism-config`; Index permission: `indices_all` +- Index: `*`; Index permission: `manage_aliases` + +For instructions on how to map users to roles, see [Map users to roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/#map-users-to-roles). + +## Amazon OpenSearch Service domain security + +The `opensearch` sink plugin can send data to an [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/what-is.html) domain, which uses IAM for security. The plugin uses the default credential chain. Run `aws configure` using the [AWS Command Line Interface (AWS CLI)](https://aws.amazon.com/cli/) to set your credentials. + +Make sure the credentials that you configure have the required IAM permissions. The following domain access policy demonstrates the minimum required permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam:::user/data-prepper-user" + }, + "Action": "es:ESHttp*", + "Resource": [ + "arn:aws:es:us-east-1::domain//otel-v1*", + "arn:aws:es:us-east-1::domain//_template/otel-v1*", + "arn:aws:es:us-east-1::domain//_plugins/_ism/policies/raw-span-policy", + "arn:aws:es:us-east-1::domain//_alias/otel-v1*", + "arn:aws:es:us-east-1::domain//_alias/_bulk" + ] + }, + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam:::user/data-prepper-user" + }, + "Action": "es:ESHttpGet", + "Resource": "arn:aws:es:us-east-1::domain//_cluster/settings" + } + ] +} +``` + +For instructions on how to configure the domain access policy, see [Resource-based policies +](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/ac.html#ac-types-resource) in the Amazon OpenSearch Service documentation. + +### Fine-grained access control + +If your OpenSearch Service domain uses [fine-grained access control](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/fgac.html), the `opensearch` sink plugin requires some additional configuration. + +#### IAM ARN as master user + +If you're using an IAM Amazon Resource Name (ARN) as the master user, include the `aws_sigv4` option in your sink configuration: + +```yaml +... +sink: + opensearch: + hosts: ["https://your-fgac-amazon-opensearch-service-endpoint"] + aws_sigv4: true +``` + +Run `aws configure` using the AWS CLI to use the master IAM user credentials. If you don't want to use the master user, you can specify a different IAM role using the `aws_sts_role_arn` option. The plugin will then use this role to sign requests sent to the domain sink. The ARN that you specify must be included in the [domain access policy]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sinks/opensearch/#amazon-opensearch-service-domain-security). 
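+
+The following is a minimal sketch of this setup. The `aws_sigv4`, `aws_region`, and `aws_sts_role_arn` options are described in the preceding configuration table; the endpoint and the `DataPrepperSinkRole` role name are placeholders:
+
+```yaml
+...
+sink:
+  opensearch:
+    hosts: ["https://your-fgac-amazon-opensearch-service-endpoint"]
+    aws_sigv4: true
+    aws_region: "us-east-1"
+    # Placeholder role; it must be granted the required permissions and
+    # included in the domain access policy.
+    aws_sts_role_arn: "arn:aws:iam::123456789012:role/DataPrepperSinkRole"
+```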
+ +#### Master user in the internal user database + +If your domain uses a master user in the internal user database, specify the master username and password as well as the `aws_sigv4` option: + +```yaml +sink: + opensearch: + hosts: ["https://your-fgac-amazon-opensearch-service-endpoint"] + aws_sigv4: false + username: "master-username" + password: "master-password" +``` + +For more information, see [Recommended configurations](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/fgac.html#fgac-recommendations) in the Amazon OpenSearch Service documentation. + +***Note***: You can create a new IAM role or internal user database user with the `all_access` permission and use it instead of the master user. + +## OpenSearch Serverless collection security + +The `opensearch` sink plugin can send data to an [Amazon OpenSearch Serverless](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless.html) collection. + +OpenSearch Serverless collection sinks have the following limitations: + +- You can't write to a collection that uses virtual private cloud (VPC) access. The collection must be accessible from public networks. +- The OTel trace group processor doesn't currently support collection sinks. + +### Creating a pipeline role + +First, create an IAM role that the pipeline will assume in order to write to the collection. The role must have the following minimum permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "aoss:BatchGetCollection" + ], + "Resource": "*" + } + ] +} +``` + +The role must have the following trust relationship, which allows the pipeline to assume it: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam:::root" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` + +### Creating a collection + +Next, create a collection with the following settings: + +- Public [network access](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-network.html) to both the OpenSearch endpoint and OpenSearch Dashboards. +- The following [data access policy](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-data-access.html), which grants the required permissions to the pipeline role: + + ```json + [ + { + "Rules":[ + { + "Resource":[ + "index/collection-name/*" + ], + "Permission":[ + "aoss:CreateIndex", + "aoss:UpdateIndex", + "aoss:DescribeIndex", + "aoss:WriteDocument" + ], + "ResourceType":"index" + } + ], + "Principal":[ + "arn:aws:iam:::role/PipelineRole" + ], + "Description":"Pipeline role access" + } + ] + ``` + + ***Important***: Make sure to replace the ARN in the `Principal` element with the ARN of the pipeline role that you created in the preceding step. + + For instructions on how to create collections, see [Creating collections](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-manage.html#serverless-create) in the Amazon OpenSearch Service documentation. + +### Creating a pipeline + +Within your `pipelines.yaml` file, specify the OpenSearch Serverless collection endpoint as the `hosts` option. In addition, you must set the `serverless` option to `true`. 
Specify the pipeline role in the `sts_role_arn` option: + +```yaml +log-pipeline: + source: + http: + processor: + - date: + from_time_received: true + destination: "@timestamp" + sink: + - opensearch: + hosts: [ "https://" ] + index: "my-serverless-index" + aws: + serverless: true + sts_role_arn: "arn:aws:iam:::role/PipelineRole" + region: "us-east-1" +``` diff --git a/_data-prepper/pipelines/configuration/sinks/pipeline.md b/_data-prepper/pipelines/configuration/sinks/pipeline.md new file mode 100644 index 00000000..3cba75a2 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sinks/pipeline.md @@ -0,0 +1,30 @@ +--- +layout: default +title: pipeline +parent: Sinks +grand_parent: Pipelines +nav_order: 55 +--- + +# pipeline + +Use the `pipeline` sink to write to another pipeline. + +## Configuration options + +The `pipeline` sink supports the following configuration options. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +name | Yes | String | Name of the pipeline to write to. + +## Usage + +The following example configures a `pipeline` sink that writes to a pipeline named `movies`: + +``` +sample-pipeline: + sink: + - pipeline: + name: movies +``` diff --git a/_data-prepper/pipelines/configuration/sinks/s3.md b/_data-prepper/pipelines/configuration/sinks/s3.md new file mode 100644 index 00000000..cb881e81 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sinks/s3.md @@ -0,0 +1,158 @@ +--- +layout: default +title: s3 +parent: Sinks +grand_parent: Pipelines +nav_order: 55 +--- + +# s3 + +The `s3` sink saves batches of events to [Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) objects. + +## Usage + +The following example creates a pipeline configured with an s3 sink. It contains additional options for customizing the event and size thresholds for which the pipeline sends record events and sets the codec type `ndjson`: + +``` +pipeline: + ... + sink: + - s3: + aws: + region: us-east-1 + sts_role_arn: arn:aws:iam::123456789012:role/Data-Prepper + sts_header_overrides: + max_retries: 5 + bucket: + name: bucket_name + object_key: + path_prefix: my-elb/%{yyyy}/%{MM}/%{dd}/ + threshold: + event_count: 2000 + maximum_size: 50mb + event_collect_timeout: 15s + codec: + ndjson: + buffer_type: in_memory +``` + +## Configuration + +Use the following options when customizing the `s3` sink. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`bucket` | Yes | String | The object from which the data is retrieved and then stored. The `name` must match the name of your object store. +`codec` | Yes | [Buffer type](#buffer-type) | Determines the buffer type. +`aws` | Yes | AWS | The AWS configuration. See [aws](#aws) for more information. +`threshold` | Yes | [Threshold](#threshold-configuration) | Configures when to write an object to S3. +`object_key` | No | Sets the `path_prefix` and the `file_pattern` of the object store. Defaults to the S3 object `events-%{yyyy-MM-dd'T'hh-mm-ss}` found inside the root directory of the bucket. +`compression` | No | String | The compression algorithm to apply: `none`, `gzip`, or `snappy`. Default is `none`. +`buffer_type` | No | [Buffer type](#buffer-type) | Determines the buffer type. +`max_retries` | No | Integer | The maximum number of times a single request should retry when ingesting data to S3. Defaults to `5`. + +## aws + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`region` | No | String | The AWS Region to use for credentials. 
Defaults to [standard SDK behavior to determine the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +`sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon SQS and Amazon S3. Defaults to `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). +`sts_header_overrides` | No | Map | A map of header overrides that the IAM role assumes for the sink plugin. +`sts_external_id` | No | String | The external ID to attach to AssumeRole requests from AWS STS. + + +## Threshold configuration + +Use the following options to set ingestion thresholds for the `s3` sink. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`event_count` | Yes | Integer | The maximum number of events the S3 bucket can ingest. +`maximum_size` | Yes | String | The maximum number of bytes that the S3 bucket can ingest after compression. Defaults to `50mb`. +`event_collect_timeout` | Yes | String | Sets the time period during which events are collected before ingestion. All values are strings that represent duration, either an ISO_8601 notation string, such as `PT20.345S`, or a simple notation, such as `60s` or `1500ms`. + + +## Buffer type + +`buffer_type` is an optional configuration that records stored events temporarily before flushing them into an S3 bucket. The default value is `in_memory`. Use one of the following options: + +- `in_memory`: Stores the record in memory. +- `local_file`: Flushes the record into a file on your machine. +- `multipart`: Writes using the [S3 multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpuoverview.html). Every 10 MB is written as a part. + +## Object key configuration + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`path_prefix` | Yes | String | The S3 key prefix path to use. Accepts date-time formatting. For example, you can use `%{yyyy}/%{MM}/%{dd}/%{HH}/` to create hourly folders in S3. By default, events write to the root of the bucket. + + +## codec + +The `codec` determines how the `s3` source formats data written to each S3 object. + +### avro codec + +The `avro` codec writes an event as an [Apache Avro](https://avro.apache.org/) document. + +Because Avro requires a schema, you may either define the schema yourself, or Data Prepper will automatically generate a schema. +In general, you should define your own schema because it will most accurately reflect your needs. + +We recommend that you make your Avro fields use a null [union](https://avro.apache.org/docs/current/specification/#unions). +Without the null union, each field must be present or the data will fail to write to the sink. +If you can be certain that each each event has a given field, you can make it non-nullable. + +When you provide your own Avro schema, that schema defines the final structure of your data. +Therefore, any extra values inside any incoming events that are not mapped in the Arvo schema will not be included in the final destination. +To avoid confusion between a custom Arvo schema and the `include_keys` or `exclude_keys` sink configurations, Data Prepper does not allow the use of the `include_keys` or `exclude_keys` with a custom schema. + +In cases where your data is uniform, you may be able to automatically generate a schema. +Automatically generated schemas are based on the first event received by the codec. 
+The schema will only contain keys from this event. +Therefore, you must have all keys present in all events in order for the automatically generated schema to produce a working schema. +Automatically generated schemas make all fields nullable. +Use the sink's `include_keys` and `exclude_keys` configurations to control what data is included in the auto-generated schema. + + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`schema` | Yes | String | The Avro [schema declaration](https://avro.apache.org/docs/current/specification/#schema-declaration). Not required if `auto_schema` is set to true. +`auto_schema` | No | Boolean | When set to `true`, automatically generates the Avro [schema declaration](https://avro.apache.org/docs/current/specification/#schema-declaration) from the first event. + + +### ndjson codec + +The `ndjson` codec writes each line as a JSON object. + +The `ndjson` codec does not take any configurations. + + +### json codec + +The `json` codec writes events in a single large JSON file. +Each event is written into an object within a JSON array. + + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`key_name` | No | String | The name of the key for the JSON array. By default this is `events`. + + +### parquet codec + +The `parquet` codec writes events into a Parquet file. +When using the Parquet codec, set the `buffer_type` to `in_memory`. + +The Parquet codec writes data using the Avro schema. +Because Parquet requires an Avro schema, you may either define the schema yourself, or Data Prepper will automatically generate a schema. +However, we generally recommend that you define your own schema so that it can best meet your needs. + +For details on the Avro schema and recommendations, see the [Avro codec](#avro-codec) documentation. + + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`schema` | Yes | String | The Avro [schema declaration](https://avro.apache.org/docs/current/specification/#schema-declaration). Not required if `auto_schema` is set to true. +`auto_schema` | No | Boolean | When set to `true`, automatically generates the Avro [schema declaration](https://avro.apache.org/docs/current/specification/#schema-declaration) from the first event. + diff --git a/_data-prepper/pipelines/configuration/sinks/sinks.md b/_data-prepper/pipelines/configuration/sinks/sinks.md new file mode 100644 index 00000000..0f3af6ab --- /dev/null +++ b/_data-prepper/pipelines/configuration/sinks/sinks.md @@ -0,0 +1,22 @@ +--- +layout: default +title: Sinks +parent: Pipelines +has_children: true +nav_order: 30 +--- + +# Sinks + +Sinks define where Data Prepper writes your data to. + +## General options for all sink types + +The following table describes options you can use to configure the `sinks` sink. + +Option | Required | Type | Description +:--- | :--- |:------------| :--- +routes | No | String list | A list of routes for which this sink applies. If not provided, this sink receives all events. See [conditional routing]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines#conditional-routing) for more information. +tags_target_key | No | String | When specified, includes event tags in the output of the provided key. +include_keys | No | String list | When specified, provides the keys in this list in the data sent to the sink. Some codecs and sinks do not allow use of this field. +exclude_keys | No | String list | When specified, excludes the keys given from the data sent to the sink. 
Some codecs and sinks do not allow use of this field.
diff --git a/_data-prepper/pipelines/configuration/sinks/stdout.md b/_data-prepper/pipelines/configuration/sinks/stdout.md
new file mode 100644
index 00000000..35b1b081
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/sinks/stdout.md
@@ -0,0 +1,11 @@
+---
+layout: default
+title: stdout sink
+parent: Sinks
+grand_parent: Pipelines
+nav_order: 45
+---
+
+# stdout sink
+
+Use the `stdout` sink for console output and testing. It has no configurable options.
diff --git a/_data-prepper/pipelines/configuration/sources/dynamo-db.md b/_data-prepper/pipelines/configuration/sources/dynamo-db.md
new file mode 100644
index 00000000..597e8351
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/sources/dynamo-db.md
@@ -0,0 +1,96 @@
+---
+layout: default
+title: dynamodb
+parent: Sources
+grand_parent: Pipelines
+nav_order: 3
+---
+
+# dynamodb
+
+The `dynamodb` source enables change data capture (CDC) on [Amazon DynamoDB](https://aws.amazon.com/dynamodb/) tables. It can receive table events, such as `create`, `update`, or `delete`, using DynamoDB streams and supports initial snapshots using [point-in-time recovery (PITR)](https://aws.amazon.com/dynamodb/pitr/).
+
+The source includes two ingestion options to stream DynamoDB events:
+
+1. A _full initial snapshot_ using [PITR](https://aws.amazon.com/dynamodb/pitr/) takes an initial snapshot of the current state of the DynamoDB table. This requires that both PITR and DynamoDB streams are enabled on your DynamoDB table.
+2. Stream events from DynamoDB streams without a full initial snapshot. This is useful if you already have a snapshot mechanism within your pipelines. This requires that the DynamoDB stream option is enabled on the DynamoDB table.
+
+## Usage
+
+The following example pipeline specifies DynamoDB as a source. It ingests data from a DynamoDB table named `table-a` through a PITR snapshot. It also indicates the `start_position`, which tells the pipeline how to read DynamoDB stream events:
+
+```yaml
+version: "2"
+cdc-pipeline:
+  source:
+    dynamodb:
+      tables:
+        - table_arn: "arn:aws:dynamodb:us-west-2:123456789012:table/table-a"
+          export:
+            s3_bucket: "test-bucket"
+            s3_prefix: "myprefix"
+          stream:
+            start_position: "LATEST" # Read latest data from streams (Default)
+      aws:
+        region: "us-west-2"
+        sts_role_arn: "arn:aws:iam::123456789012:role/my-iam-role"
+```
+
+## Configuration options
+
+The following tables describe the configuration options for the `dynamodb` source.
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+`aws` | Yes | AWS | The AWS configuration. See [aws](#aws) for more information.
+`acknowledgments` | No | Boolean | When `true`, enables the `dynamodb` source to receive [end-to-end acknowledgments]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines#end-to-end-acknowledgments) when events are received by OpenSearch sinks.
+`shared_acknowledgement_timeout` | No | Duration | The amount of time that elapses before the data read from a DynamoDB stream expires when used with acknowledgments. Default is 10 minutes.
+`s3_data_file_acknowledgment_timeout` | No | Duration | The amount of time that elapses before the data read from a DynamoDB export expires when used with acknowledgments. Default is 5 minutes.
+`tables` | Yes | List | The configuration for the DynamoDB tables. See [tables](#tables) for more information.
+
+### aws
+
+Use the following options in the AWS configuration.
+ +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`region` | No | String | The AWS Region to use for credentials. Defaults to [standard SDK behavior to determine the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +`sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon Simple Queue Service (Amazon SQS) and Amazon Simple Storage Service (Amazon S3). Defaults to `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). +`aws_sts_header_overrides` | No | Map | A map of header overrides that the AWS Identity and Access Management (IAM) role assumes for the sink plugin. + + +### tables + +Use the following options with the `tables` configuration. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`table_arn` | Yes | String | The Amazon Resource Name (ARN) of the source DynamoDB table. +`export` | No | Export | Determines how to export DynamoDB events. For more information, see [export](#export-options). +`stream` | No | Stream | Determines how the pipeline reads data from the DynamoDB table. For more information, see [stream](#stream-option). + +#### Export options + +The following options let you customize the export destination for DynamoDB events. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`s3_bucket` | Yes | String | The destination bucket that stores the exported data files. +`s3_prefix` | No | String | The custom prefix for the S3 bucket. +`s3_sse_kms_key_id` | No | String | An AWS Key Management Service (AWS KMS) key that encrypts the export data files. The `key_id` is the ARN of the KMS key, for example, `arn:aws:kms:us-west-2:123456789012:key/0a4bc22f-bb96-4ad4-80ca-63b12b3ec147`. +`s3_region` | No | String | The Region for the S3 bucket. + +#### Stream option + +The following option lets you customize how the pipeline reads events from the DynamoDB table. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`start_position` | No | String | The position from where the source starts reading stream events when the DynamoDB stream option is enabled. `LATEST` starts reading events from the most recent stream record. + + + + + + + diff --git a/_data-prepper/pipelines/configuration/sources/http-source.md b/_data-prepper/pipelines/configuration/sources/http-source.md new file mode 100644 index 00000000..b41855cd --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/http-source.md @@ -0,0 +1,56 @@ +--- +layout: default +title: http_source +parent: Sources +grand_parent: Pipelines +nav_order: 5 +--- + +# http_source + +`http_source` is a source plugin that supports HTTP. Currently, `http_source` only supports the JSON UTF-8 codec for incoming requests, such as `[{"key1": "value1"}, {"key2": "value2"}]`. The following table describes options you can use to configure the `http_source` source. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +port | No | Integer | The port that the source is running on. Default value is `2021`. Valid options are between `0` and `65535`. +health_check_service | No | Boolean | Enables the health check service on the `/health` endpoint on the defined port. Default value is `false`. +unauthenticated_health_check | No | Boolean | Determines whether or not authentication is required on the health check endpoint. 
Data Prepper ignores this option if no authentication is defined. Default value is `false`.
+request_timeout | No | Integer | The request timeout, in milliseconds. Default value is `10000`.
+thread_count | No | Integer | The number of threads to keep in the ScheduledThreadPool. Default value is `200`.
+max_connection_count | No | Integer | The maximum allowed number of open connections. Default value is `500`.
+max_pending_requests | No | Integer | The maximum allowed number of tasks in the `ScheduledThreadPool` work queue. Default value is `1024`.
+authentication | No | Object | An authentication configuration. By default, this creates an unauthenticated server for the pipeline. This uses pluggable authentication for HTTPS. To use basic authentication, define the `http_basic` plugin with a `username` and `password`. To provide custom authentication, use or create a plugin that implements [ArmeriaHttpAuthenticationProvider](https://github.com/opensearch-project/data-prepper/blob/1.2.0/data-prepper-plugins/armeria-common/src/main/java/com/amazon/dataprepper/armeria/authentication/ArmeriaHttpAuthenticationProvider.java).
+ssl | No | Boolean | Enables TLS/SSL. Default value is `false`.
+ssl_certificate_file | Conditionally | String | The SSL certificate chain file path or Amazon Simple Storage Service (Amazon S3) path. Amazon S3 path example `s3:///`. Required if `ssl` is set to `true` and `use_acm_certificate_for_ssl` is set to `false`.
+ssl_key_file | Conditionally | String | The SSL key file path or Amazon S3 path. Amazon S3 path example `s3:///`. Required if `ssl` is set to `true` and `use_acm_certificate_for_ssl` is set to `false`.
+use_acm_certificate_for_ssl | No | Boolean | Enables TLS/SSL using a certificate and private key from AWS Certificate Manager (ACM). Default value is `false`.
+acm_certificate_arn | Conditionally | String | The ACM certificate Amazon Resource Name (ARN). The ACM certificate takes precedence over an Amazon S3 or local file system certificate. Required if `use_acm_certificate_for_ssl` is set to `true`.
+acm_private_key_password | No | String | The ACM private key password that decrypts the private key. If not provided, Data Prepper generates a random password.
+acm_certificate_timeout_millis | No | Integer | The timeout, in milliseconds, for ACM to get certificates. Default value is `120000`.
+aws_region | Conditionally | String | The AWS Region used by ACM or Amazon S3. Required if `use_acm_certificate_for_ssl` is set to `true` or if `ssl_certificate_file` and `ssl_key_file` are Amazon S3 paths.
+
+## Metrics
+
+The `http_source` source includes the following metrics.
+
+### Counters
+
+- `requestsReceived`: Measures the total number of requests received by the `/log/ingest` endpoint.
+- `requestsRejected`: Measures the total number of requests rejected (429 response status code) by the HTTP Source plugin.
+- `successRequests`: Measures the total number of requests successfully processed (200 response status code) by the HTTP Source plugin.
+- `badRequests`: Measures the total number of requests with either an invalid content type or format processed by the HTTP Source plugin (400 response status code).
+- `requestTimeouts`: Measures the total number of requests that time out in the HTTP source server (415 response status code).
+- `requestsTooLarge`: Measures the total number of requests in which the size of the event is larger than the buffer capacity (413 response status code).
+- `internalServerError`: Measures the total number of requests processed by the HTTP Source with a custom exception type (500 response status code). + +### Timers + +- `requestProcessDuration`: Measures the latency of requests processed by the HTTP Source plugin in seconds. + +### Distribution summaries + +- `payloadSize`: Measures the incoming request payload size in bytes. diff --git a/_data-prepper/pipelines/configuration/sources/kafka.md b/_data-prepper/pipelines/configuration/sources/kafka.md new file mode 100644 index 00000000..4df72cfd --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/kafka.md @@ -0,0 +1,144 @@ +--- +layout: default +title: kafka +parent: Sources +grand_parent: Pipelines +nav_order: 6 +--- + +# kafka + +You can use the Apache Kafka source (`kafka`) in Data Prepper to read records from one or more Kafka [topics](https://kafka.apache.org/intro#intro_concepts_and_terms). These records hold events that your Data Prepper pipeline can ingest. The `kafka` source uses Kafka's [Consumer API](https://kafka.apache.org/documentation/#consumerapi) to consume messages from the Kafka broker, which then creates Data Prepper events for further processing by the Data Prepper pipeline. + +## Usage + +The following example shows the `kafka` source in a Data Prepper pipeline: + +```json +kafka-pipeline: + source: + kafka: + bootstrap_servers: + - 127.0.0.1:9093 + topics: + - name: Topic1 + group_id: groupID1 + - name: Topic2 + group_id: groupID1 +``` + +## Configuration + +Use the following configuration options with the `kafka` source. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`bootstrap_servers` | Yes, when not using Amazon Managed Streaming for Apache Kafka (Amazon MSK) as a cluster. | IP address | The host or port for the initial connection to the Kafka cluster. You can configure multiple Kafka brokers by using the IP address or port number for each broker. When using [Amazon MSK](https://aws.amazon.com/msk/) as your Kafka cluster, the bootstrap server information is obtained from MSK using the MSK Amazon Resource Name (ARN) provided in the configuration. +`topics` | Yes | JSON array | The Kafka topics that the Data Prepper `kafka` source uses to read messages. You can configure up to 10 topics. For more information about `topics` configuration options, see [Topics](#topics). +`schema` | No | JSON object | The schema registry configuration. For more information, see [Schema](#schema). +`authentication` | No | JSON object | Set the authentication options for both the pipeline and Kafka. For more information, see [Authentication](#authentication). +`encryption` | No | JSON object | The encryption configuration. For more information, see [Encryption](#encryption). +`aws` | No | JSON object | The AWS configuration. For more information, see [aws](#aws). +`acknowledgments` | No | Boolean | If `true`, enables the `kafka` source to receive [end-to-end acknowledgments]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines/#end-to-end-acknowledgments) when events are received by OpenSearch sinks. Default is `false`. +`client_dns_lookup` | Yes, when a DNS alias is used. | String | Sets Kafka's `client.dns.lookup` option. Default is `default`. + +### Topics + +Use the following options in the `topics` array. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`name` | Yes | String | The name of each Kafka topic. +`group_id` | Yes | String | Sets Kafka's `group.id` option. 
+`workers` | No | Integer | The number of multithreaded consumers associated with each topic. Default is `2`. The maximum value is `200`. +`serde_format` | No | String | Indicates the serialization and deserialization format of the messages in the topic. Default is `plaintext`. +`auto_commit` | No | Boolean | When `false`, the consumer's offset will not be periodically committed to Kafka in the background. Default is `false`. +`commit_interval` | No | Integer | When `auto_commit` is set to `true`, sets how frequently, in seconds, the consumer offsets are auto-committed to Kafka through Kafka's `auto.commit.interval.ms` option. Default is `5s`. +`session_timeout` | No | Integer | The amount of time during which the source detects client failures when using Kafka's group management features, which can be used to balance the data stream. Default is `45s`. +`auto_offset_reset` | No | String | Automatically resets the offset to an earlier or the latest offset through Kafka's `auto.offset.reset` option. Default is `latest`. +`thread_waiting_time` | No | Integer | The amount of time that threads wait for the preceding thread to complete its task and to signal the next thread. The Kafka consumer API poll timeout value is set to half of this setting. Default is `5s`. +`max_partition_fetch_bytes` | No | Integer | Sets the maximum limit in megabytes for max data returns from each partition through Kafka's `max.partition.fetch.bytes` setting. Default is `1mb`. +`heart_beat_interval` | No | Integer | The expected amount of time between heartbeats to the consumer coordinator when using Kafka's group management facilities through Kafka's `heartbeat.interval.ms` setting. Default is `5s`. +`fetch_max_wait` | No | Integer | The maximum amount of time during which the server blocks a fetch request when there isn't sufficient data to satisfy the `fetch_min_bytes` requirement through Kafka's `fetch.max.wait.ms` setting. Default is `500ms`. +`fetch_max_bytes` | No | Integer | The maximum record size accepted by the broker through Kafka's `fetch.max.bytes` setting. Default is `50mb`. +`fetch_min_bytes` | No | Integer | The minimum amount of data the server returns during a fetch request through Kafka's `retry.backoff.ms` setting. Default is `1b`. +`retry_backoff` | No | Integer | The amount of time to wait before attempting to retry a failed request to a given topic partition. Default is `10s`. +`max_poll_interval` | No | Integer | The maximum delay between invocations of a `poll()` when using group management through Kafka's `max.poll.interval.ms` option. Default is `300s`. +`consumer_max_poll_records` | No | Integer | The maximum number of records returned in a single `poll()` call through Kafka's `max.poll.records` setting. Default is `500`. +`key_mode` | No | String | Indicates how the key field of the Kafka message should be handled. The default setting is `include_as_field`, which includes the key in the `kafka_key` event. The `include_as_metadata` setting includes the key in the event's metadata. The `discard` setting discards the key. + +### Schema + +The following option is required inside the `schema` configuration. + +Option | Type | Description +:--- | :--- | :--- +`type` | String | Sets the type of schema based on your registry, either the AWS Glue Schema Registry, `aws_glue`, or the Confluent Schema Registry, `confluent`. When using the `aws_glue` registry, set any [AWS](#aws) configuration options. + +The following configuration options are only required when using a `confluent` registry. 
+
+Option | Type | Description
+:--- | :--- | :---
+`registry_url` | String | The URL of the schema registry.
+`version` | String | The schema version.
+`schema_registry_api_key` | String | The schema registry API key.
+`schema_registry_api_secret` | String | The schema registry API secret.
+
+### Authentication
+
+The following option is required inside the `authentication` object.
+
+Option | Type | Description
+:--- | :--- | :---
+`sasl` | JSON object | The Simple Authentication and Security Layer (SASL) authentication configuration.
+
+### SASL
+
+Use one of the following options when configuring SASL authentication.
+
+Option | Type | Description
+:--- | :--- | :---
+`plaintext` | JSON object | The [PLAINTEXT](#sasl-plaintext) authentication configuration.
+`aws_msk_iam` | String | The Amazon MSK AWS Identity and Access Management (IAM) configuration. If set to `role`, the `sts_role_arn` set in the `aws` configuration is used. Default is `default`.
+
+#### SASL PLAINTEXT
+
+The following options are required when using the [SASL PLAINTEXT](https://kafka.apache.org/10/javadoc/org/apache/kafka/common/security/auth/SecurityProtocol.html) protocol.
+
+Option | Type | Description
+:--- | :--- | :---
+`username` | String | The username for PLAINTEXT authentication.
+`password` | String | The password for PLAINTEXT authentication.
+
+### Encryption
+
+Use the following options when setting up SSL encryption.
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+`type` | No | String | The encryption type. Use `none` to disable encryption. Default is `ssl`.
+`insecure` | No | Boolean | A Boolean flag used to turn off SSL certificate verification. If set to `true`, certificate authority (CA) certificate verification is turned off and insecure HTTP requests are sent. Default is `false`.
+
+### AWS
+
+Use the following options when setting up authentication for `aws` services.
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+`region` | No | String | The AWS Region to use for credentials. Defaults to [standard SDK behavior to determine the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html).
+`sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon Simple Queue Service (Amazon SQS) and Amazon Simple Storage Service (Amazon S3). Default is `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html).
+`msk` | No | JSON object | The [MSK](#msk) configuration settings.
+
+#### MSK
+
+Use the following options inside the `msk` object.
+
+Option | Required | Type | Description
+:--- | :--- | :--- | :---
+`arn` | Yes | String | The [MSK ARN](https://docs.aws.amazon.com/msk/1.0/apireference/configurations-arn.html) to use.
+`broker_connection_type` | No | String | The type of connector to use with the MSK broker, either `public`, `single_vpc`, or `multi_vpc`. Default is `single_vpc`.
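+
+As a reference, the following is a minimal sketch that combines the preceding `topics`, `authentication`, `encryption`, `aws`, and `msk` options for an Amazon MSK cluster. The topic name, consumer group, IAM role, and cluster ARN are placeholders:
+
+```yaml
+kafka-msk-pipeline:
+  source:
+    kafka:
+      topics:
+        - name: my-topic        # Placeholder topic name
+          group_id: my-group    # Placeholder consumer group
+      authentication:
+        sasl:
+          aws_msk_iam: role     # Sign requests using the sts_role_arn from the aws configuration
+      encryption:
+        type: ssl
+      aws:
+        region: "us-east-1"
+        sts_role_arn: "arn:aws:iam::123456789012:role/my-msk-role"  # Placeholder role
+        msk:
+          arn: "arn:aws:kafka:us-east-1:123456789012:cluster/my-cluster/placeholder"  # Placeholder MSK ARN
+          broker_connection_type: public
+```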
+ diff --git a/_data-prepper/pipelines/configuration/sources/opensearch.md b/_data-prepper/pipelines/configuration/sources/opensearch.md new file mode 100644 index 00000000..7cc0b9a3 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/opensearch.md @@ -0,0 +1,369 @@ +--- +layout: default +title: opensearch +parent: Sources +grand_parent: Pipelines +nav_order: 30 +--- + +# opensearch + +The `opensearch` source plugin is used to read indexes from an OpenSearch cluster, a legacy Elasticsearch cluster, an Amazon OpenSearch Service domain, or an Amazon OpenSearch Serverless collection. + +The plugin supports OpenSearch 2.x and Elasticsearch 7.x. + +## Usage + +To use the `opensearch` source with the minimum required settings, add the following configuration to your `pipeline.yaml` file: + +```yaml +opensearch-source-pipeline: + source: + opensearch: + hosts: [ "https://localhost:9200" ] + username: "username" + password: "password" + ... +``` + +To use the `opensearch` source with all configuration settings, including `indices`, `scheduling`, `search_options`, and `connection`, add the following example to your `pipeline.yaml` file: + +```yaml +opensearch-source-pipeline: + source: + opensearch: + hosts: [ "https://localhost:9200" ] + username: "username" + password: "password" + indices: + include: + - index_name_regex: "test-index-.*" + exclude: + - index_name_regex: "\..*" + scheduling: + interval: "PT1H" + index_read_count: 2 + start_time: "2023-06-02T22:01:30.00Z" + search_options: + search_context_type: "none" + batch_size: 1000 + connection: + insecure: false + cert: "/path/to/cert.crt" + ... +``` + +## Amazon OpenSearch Service + +The `opensearch` source can be configured for an Amazon OpenSearch Service domain by passing an `sts_role_arn` with access to the domain, as shown in the following example: + +```yaml +opensearch-source-pipeline: + source: + opensearch: + hosts: [ "https://search-my-domain-soopywaovobopgs8ywurr3utsu.us-east-1.es.amazonaws.com" ] + aws: + region: "us-east-1" + sts_role_arn: "arn:aws:iam::123456789012:role/my-domain-role" + ... +``` + +## Amazon OpenSearch Serverless + +The `opensearch` source can be configured with Amazon OpenSearch Serverless by setting the `serverless` option to `true`, as shown in the following example: + +```yaml + - opensearch: + hosts: [ 'https://1234567890abcdefghijkl.us-west-2.aoss.amazonaws.com' ] + aws: + sts_role_arn: 'arn:aws:iam::123456789012:role/my-domain-role' + region: 'us-west-2' + serverless: true +``` + + +## Using metadata + +When the `opensource` source constructs Data Prepper events from documents in the cluster, the document index is stored in the EventMetadata with an `opensearch-index` key, and the document_id is stored in the `EventMetadata` with the `opensearch-document_id` as the key. This allows for conditional routing based on the index or `document_id`. 
The following example pipeline configuration sends events to an `opensearch` sink and uses the same index and `document_id` from the source cluster as in the destination cluster: + + +```yaml +opensearch-migration-pipeline: + source: + opensearch: + hosts: [ "https://source-cluster:9200" ] + username: "username" + password: "password" + sink: + - opensearch: + hosts: [ "https://sink-cluster:9200" ] + username: "username" + password: "password" + document_id: "${getMetadata(\"opensearch-document_id\")}" + index: "${getMetadata(\"opensearch-index\"}" +``` + +## Configuration options + + +The following table describes options you can configure for the `opensearch` source. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`hosts` | Yes | List | A list of OpenSearch hosts to write to, for example, `["https://localhost:9200", "https://remote-cluster:9200"]`. +`username` | No | String | The username for HTTP basic authentication. Since Data Prepper 2.5, this setting can be refreshed at runtime if [AWS secrets reference]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/configuring-data-prepper/#reference-secrets) is applied. +`password` | No | String | The password for HTTP basic authentication. Since Data Prepper 2.5, this setting can be refreshed at runtime if [AWS secrets reference]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/configuring-data-prepper/#reference-secrets) is applied. +`disable_authentication` | No | Boolean | Whether authentication is disabled. Defaults to `false`. +`aws` | No | Object | The AWS configuration. For more information, see [aws](#aws). +`acknowledgments` | No | Boolean | When `true`, enables the `opensearch` source to receive [end-to-end acknowledgments]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines/#end-to-end-acknowledgments) when events are received by OpenSearch sinks. Default is `false`. +`connection` | No | Object | The connection configuration. For more information, see [Connection](#connection). +`indices` | No | Object | The configuration for filtering which indexes are processed. Defaults to all indexes, including system indexes. For more information, see [indexes](#indices). +`scheduling` | No | Object | The scheduling configuration. For more information, see [Scheduling](#scheduling). +`search_options` | No | Object | A list of search options performed by the source. For more information, see [Search options](#search_options). +`serverless` | No | Boolean | Determines whether the OpenSearch backend is Amazon OpenSearch Serverless. Set this value to `true` when the destination for the `opensearch` source is an Amazon OpenSearch Serverless collection. Default is `false`. +`serverless_options` | No | Object | The network configuration options available when the backend of the `opensearch` source is set to Amazon OpenSearch Serverless. For more information, see [Serverless options](#serverless-options). + +### Serverless options + +The following options can be used in the `serverless_options` object. + +Option | Required | Type | Description +:--- | :--- | :---| :--- +`network_policy_name` | Yes | String | The name of the network policy to create. +`collection_name` | Yes | String | The name of the Amazon OpenSearch Serverless collection to configure. +`vpce_id` | Yes | String | The virtual private cloud (VPC) endpoint to which the source connects. 
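
The following is a possible sketch of the serverless options. It assumes, as in the Amazon OpenSearch Serverless example earlier on this page, that the `serverless` flag and `serverless_options` are nested under `aws`; the collection endpoint, role, policy, collection, and VPC endpoint names are placeholders:

```yaml
opensearch-serverless-source-pipeline:
  source:
    opensearch:
      hosts: [ "https://1234567890abcdefghijkl.us-west-2.aoss.amazonaws.com" ]
      aws:
        region: "us-west-2"
        sts_role_arn: "arn:aws:iam::123456789012:role/my-pipeline-role"
        serverless: true
        serverless_options:
          network_policy_name: "my-network-policy"
          collection_name: "my-collection"
          vpce_id: "vpce-0123456789abcdef0"
```
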
+ +### Scheduling + +The `scheduling` configuration allows the user to configure how indexes are reprocessed in the source based on the the `index_read_count` and recount time `interval`. + +For example, setting `index_read_count` to `3` with an `interval` of `1h` will result in all indexes being reprocessed 3 times, 1 hour apart. By default, indexes will only be processed once. + +Use the following options under the `scheduling` configuration. + +Option | Required | Type | Description +:--- | :--- |:----------------| :--- +`index_read_count` | No | Integer | The number of times each index will be processed. Default is `1`. +`interval` | No | String | The interval that determines the amount of time between reprocessing. Supports ISO 8601 notation strings, such as "PT20.345S" or "PT15M", as well as simple notation strings for seconds ("60s") and milliseconds ("1500ms"). Defaults to `8h`. +`start_time` | No | String | The time when processing should begin. The source will not start processing until this time. The string must be in ISO 8601 format, such as `2007-12-03T10:15:30.00Z`. The default option starts processing immediately. + + +### indices + +The following options help the `opensearch` source determine which indexes are processed from the source cluster using regex patterns. An index will only be processed if it matches one of the `index_name_regex` patterns under the `include` setting and does not match any of the +patterns under the `exclude` setting. + +Option | Required | Type | Description +:--- | :--- |:-----------------| :--- +`include` | No | Array of objects | A list of index configuration patterns that specifies which indexes will be processed. +`exclude` | No | Array of Objects | A list of index configuration patterns that specifies which indexes will not be processed. For example, you can specify an `index_name_regex` pattern of `\..*` to exclude system indexes. + + +Use the following setting under the `include` and `exclude` options to indicate the regex pattern for the index. + +Option | Required | Type | Description +:--- |:----|:-----------------| :--- +`index_name_regex` | Yes | Regex string | The regex pattern to match indexes against. + +### search_options + +Use the following settings under the `search_options` configuration. + +Option | Required | Type | Description +:--- |:---------|:--------| :--- +`batch_size` | No | Integer | The number of documents to read while paginating from OpenSearch. Default is `1000`. +`search_context_type` | No | Enum | An override for the type of search/pagination to use on indexes. Can be [point_in_time]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#point-in-time-with-search_after)), [scroll]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#scroll-search), or `none`. The `none` option will use the [search_after]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#the-search_after-parameter) parameter. For more information, see [Default Search Behavior](#default-search-behavior). + +### Default search behavior + +By default, the `opensearch` source will look up the cluster version and distribution to determine +which `search_context_type` to use. For versions and distributions that support [Point in Time](https://opensearch.org/docs/latest/search-plugins/searching-data/paginate/#point-in-time-with-search_after), `point_in_time` will be used. 
+If `point_in_time` is not supported by the cluster, then [scroll](https://opensearch.org/docs/latest/search-plugins/searching-data/paginate/#scroll-search) will be used. For Amazon OpenSearch Serverless collections, [search_after](https://opensearch.org/docs/latest/search-plugins/searching-data/paginate/#the-search_after-parameter) will be used because neither `point_in_time` nor `scroll` are supported by collections. + +### Connection + +Use the following settings under the `connection` configuration. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`cert` | No | String | The path to the security certificate, for example, `"config/root-ca.pem"`, when the cluster uses the OpenSearch Security plugin. +`insecure` | No | Boolean | Whether or not to verify SSL certificates. If set to `true`, the certificate authority (CA) certificate verification is disabled and insecure HTTP requests are sent. Default is `false`. + + +### AWS + +Use the following options when setting up authentication for `aws` services. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`region` | No | String | The AWS Region to use for credentials. Defaults to [standard SDK behavior to determine the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +`sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon OpenSearch Service and Amazon OpenSearch Serverless. Default is `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). +`serverless` | No | Boolean | Should be set to `true` when processing from an Amazon OpenSearch Serverless collection. Defaults to `false`. + +## Metrics + +The `opensearch` source includes the following metrics. + +### Counters + +- `documentsProcessed`: Measures the total number of documents processed by the `opensearch` source plugin. +- `indicesProcessed`: Measures the total number of indexes processed by the `opensearch` source plugin. +- `processingErrors`: Measures the total number of index processing errors made by the `opensearch` source plugin. +- `credentialsChanged`: Measures the number of times that the `opensearch` source refreshes basic credentials (username/password). +- `clientRefreshErrors`: Measures the number of errors encountered when generating a new client due to the `opensearch` source refreshing basic credentials. + +### Timers + +- `indexProcessingTime`: Measures the `opensearch` source plugin index processing latency, in seconds. + +### Distribution summaries + +- `bytesReceived`: Measures the size distribution of incoming documents, in bytes, received by the `opensearch` source plugin. +- `bytesProcessed`: Measures the size distribution of incoming document, in bytes, successfully processed by the `opensearch` source plugin. + +## OpenSearch cluster security + +In order to pull data from an OpenSearch cluster using the `opensearch` source plugin, you must specify your username and password within the pipeline configuration. The following example `pipeline.yaml` file demonstrates how to specify the default admin security credentials: + +```yaml +source: + opensearch: + username: "admin" + password: "admin" + ... 
+``` + +### Amazon OpenSearch Service domain security + +The `opensearch` source plugin can pull data from an [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/what-is.html) domain, which uses AWS Identity and Access Management (IAM) for security. The plugin uses the default Amazon OpenSearch Service credential chain. Run `aws configure` using the [AWS Command Line Interface (AWS CLI)](https://aws.amazon.com/cli/) to set your credentials. + +Make sure the credentials that you configure have the required IAM permissions. The following domain access policy shows the minimum required permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam:::user/data-prepper-user" + }, + "Action": "es:ESHttpGet", + "Resource": [ + "arn:aws:es:us-east-1::domain//", + "arn:aws:es:us-east-1::domain//_cat/indices", + "arn:aws:es:us-east-1::domain//_search", + "arn:aws:es:us-east-1::domain//_search/scroll", + "arn:aws:es:us-east-1::domain//*/_search" + ] + }, + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam:::user/data-prepper-user" + }, + "Action": "es:ESHttpPost", + "Resource": [ + "arn:aws:es:us-east-1::domain//*/_search/point_in_time", + "arn:aws:es:us-east-1::domain//*/_search/scroll" + ] + }, + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam:::user/data-prepper-user" + }, + "Action": "es:ESHttpDelete", + "Resource": [ + "arn:aws:es:us-east-1::domain//_search/point_in_time", + "arn:aws:es:us-east-1::domain//_search/scroll" + ] + } + ] +} +``` + +For instructions on how to configure the domain access policy, see [Resource-based policies +](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/ac.html#ac-types-resource) in the Amazon OpenSearch Service documentation. + +### OpenSearch Serverless collection security + +The `opensearch` source plugin can receive data from an [Amazon OpenSearch Serverless](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless.html) collection. + +You cannot read from a collection that uses virtual private cloud (VPC) access. The collection must be accessible from public networks. +{: .warning} + +#### Creating a pipeline role + +To use OpenSearch Serverless collection security, create an IAM role that the pipeline will assume in order to read from the collection. The role must have the following minimum permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "aoss:APIAccessAll" + ], + "Resource": "arn:aws:aoss:*::collection/*" + } + ] +} +``` + +#### Creating a collection + +Next, create a collection with the following settings: + +- Public [network access](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-network.html) to both the OpenSearch endpoint and OpenSearch Dashboards. 
+- The following [data access policy](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-data-access.html), which grants the required permissions to the pipeline role, as shown in the following configuration: + + ```json + [ + { + "Rules":[ + { + "Resource":[ + "index/collection-name/*" + ], + "Permission":[ + "aoss:ReadDocument", + "aoss:DescribeIndex" + ], + "ResourceType":"index" + } + ], + "Principal":[ + "arn:aws:iam:::role/PipelineRole" + ], + "Description":"Pipeline role access" + } + ] + ``` + +Make sure to replace the Amazon Resource Name (ARN) in the `Principal` element with the ARN of the pipeline role that you created in the preceding step. +{: .tip} + +For instructions on how to create collections, see [Creating collections](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-manage.html#serverless-create) in the Amazon OpenSearch Service documentation. + +#### Creating a pipeline + +Within your `pipeline.yaml` file, specify the OpenSearch Serverless collection endpoint as the `hosts` option. In addition, you must set the `serverless` option to `true`. Specify the pipeline role in the `sts_role_arn` option, as shown in the following example: + +```yaml +opensearch-source-pipeline: + source: + opensearch: + hosts: [ "https://" ] + aws: + serverless: true + sts_role_arn: "arn:aws:iam:::role/PipelineRole" + region: "us-east-1" + processor: + - date: + from_time_received: true + destination: "@timestamp" + sink: + - stdout: +``` diff --git a/_data-prepper/pipelines/configuration/sources/otel-logs-source.md b/_data-prepper/pipelines/configuration/sources/otel-logs-source.md new file mode 100644 index 00000000..58d8a2b0 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/otel-logs-source.md @@ -0,0 +1,67 @@ +--- +layout: default +title: otel_logs_source +parent: Sources +grand_parent: Pipelines +nav_order: 25 +--- + +# otel_logs_source + + +The `otel_logs_source` source is an OpenTelemetry source that follows the [OpenTelemetry Protocol Specification](https://github.com/open-telemetry/oteps/blob/master/text/0035-opentelemetry-protocol.md) and receives logs from the OTel Collector in the form of `ExportLogsServiceRequest` records. + +This source supports the `OTLP/gRPC` protocol. +{: .note} + +## Configuration + +You can configure the `otel_logs_source` source with the following options. + +| Option | Type | Description | +| :--- | :--- | :--- | +| port | int | Represents the port that the `otel_logs_source` source is running on. Default value is `21892`. | +| path | string | Represents the path for sending unframed HTTP requests. You can use this option to support an unframed gRPC request with an HTTP idiomatic path to a configurable path. The path should start with `/`, and its length should be at least 1. The `/opentelemetry.proto.collector.logs.v1.LogsService/Export` endpoint is disabled for both gRPC and HTTP requests if the path is configured. The path can contain a `${pipelineName}` placeholder, which is replaced with the pipeline name. If the value is empty and `unframed_requests` is `true`, then the path that the source provides is `/opentelemetry.proto.collector.logs.v1.LogsService/Export`. | +| request_timeout | int | Represents the request timeout duration in milliseconds. Default value is `10000`. | +| health_check_service | Boolean | Enables the gRPC health check service under `grpc.health.v1/Health/Check`. Default value is `false`. 
| +| proto_reflection_service | Boolean | Enables a reflection service for Protobuf services (see [ProtoReflectionService](https://grpc.github.io/grpc-java/javadoc/io/grpc/protobuf/services/ProtoReflectionService.html) and [gRPC reflection](https://github.com/grpc/grpc-java/blob/master/documentation/server-reflection-tutorial.md)). Default value is `false`. | +| unframed_requests | Boolean | Enables requests that are not framed using the gRPC wire protocol. Default value is `false`. | +| thread_count | int | The number of threads to keep in the `ScheduledThreadPool`. Default value is `500`. | +| max_connection_count | int | The maximum number of open connections allowed. Default value is `500`. | + +### SSL + +You can configure SSL in the `otel_logs_source` source with the following options. + +| Option | Type | Description | +| :--- | :--- | :--- | +| ssl | Boolean | Enables TLS/SSL. Default value is `true`. | +| sslKeyCertChainFile | string | Represents the SSL certificate chain file path or Amazon Simple Storage Service (Amazon S3) path. For example, see the Amazon S3 path `s3:///`. Required if `ssl` is set to `true`. | +| sslKeyFile | string | Represents the SSL key file path or Amazon S3 path. For example, see the Amazon S3 path `s3:///`. Required if `ssl` is set to `true`. | +| useAcmCertForSSL | Boolean | Enables TLS/SSL using a certificate and private key from AWS Certificate Manager (ACM). Default value is `false`. | +| acmCertificateArn | string | Represents the ACM certificate Amazon Resource Name (ARN). ACM certificates take precedence over Amazon S3 or local file system certificates. Required if `useAcmCertForSSL` is set to `true`. | +| awsRegion | string | Represents the AWS Region used by ACM or Amazon S3. Required if `useAcmCertForSSL` is set to `true` or `sslKeyCertChainFile` or `sslKeyFile` is the Amazon S3 path. | + +## Usage + +To get started, create a `pipeline.yaml` file and add `otel_logs_source` as the source: + +``` +source: + - otel_logs_source: +``` + +## Metrics + +You can use the following metrics with the `otel_logs_source` source. + +| Option | Type | Description | +| :--- | :--- | :--- | +| `requestTimeouts` | Counter | Measures the total number of requests that time out. | +| `requestsReceived` | Counter | Measures the total number of requests received by the `otel_logs_source` source. | +| `badRequests` | Counter | Measures the total number of requests that could not be parsed. | +| `requestsTooLarge` | Counter | Measures the total number of requests that exceed the maximum allowed size. Indicates that the size of the data being written into the buffer is beyond the buffer's maximum capacity. | +| `internalServerError` | Counter | Measures the total number of requests that are erroneous due to errors other than `requestTimeouts` or `requestsTooLarge`. | +| `successRequests` | Counter | Measures the total number of requests successfully written to the buffer. | +| `payloadSize` | Distribution summary | Measures the distribution of all incoming payload sizes. | +| `requestProcessDuration` | Timer | Measures the duration of request processing. 
| \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md b/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md new file mode 100644 index 00000000..03019635 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md @@ -0,0 +1,42 @@ +--- +layout: default +title: otel_metrics_source +parent: Sources +grand_parent: Pipelines +nav_order: 10 +--- + +# otel_metrics_source + +`otel_metrics_source` is an OpenTelemetry Collector source that collects metric data. The following table describes options you can use to configure the `otel_metrics_source` source. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +port | No | Integer | The port that the OpenTelemtry metrics source runs on. Default value is `21891`. +request_timeout | No | Integer | The request timeout, in milliseconds. Default value is `10000`. +health_check_service | No | Boolean | Enables a gRPC health check service under `grpc.health.v1/Health/Check`. Default value is `false`. +proto_reflection_service | No | Boolean | Enables a reflection service for Protobuf services (see [gRPC reflection](https://github.com/grpc/grpc/blob/master/doc/server-reflection.md) and [gRPC Server Reflection Tutorial](https://github.com/grpc/grpc-java/blob/master/documentation/server-reflection-tutorial.md) docs). Default value is `false`. +unframed_requests | No | Boolean | Enables requests not framed using the gRPC wire protocol. +thread_count | No | Integer | The number of threads to keep in the `ScheduledThreadPool`. Default value is `200`. +max_connection_count | No | Integer | The maximum allowed number of open connections. Default value is `500`. +ssl | No | Boolean | Enables connections to the OpenTelemetry source port over TLS/SSL. Default value is `true`. +sslKeyCertChainFile | Conditionally | String | File-system path or Amazon Simple Storage Service (Amazon S3) path to the security certificate (for example, `"config/demo-data-prepper.crt"` or `"s3://my-secrets-bucket/demo-data-prepper.crt"`). Required if `ssl` is set to `true`. +sslKeyFile | Conditionally | String | File-system path or Amazon S3 path to the security key (for example, `"config/demo-data-prepper.key"` or `"s3://my-secrets-bucket/demo-data-prepper.key"`). Required if `ssl` is set to `true`. +useAcmCertForSSL | No | Boolean | Whether to enable TLS/SSL using a certificate and private key from AWS Certificate Manager (ACM). Default value is `false`. +acmCertificateArn | Conditionally | String | Represents the ACM certificate ARN. ACM certificate take preference over S3 or local file system certificates. Required if `useAcmCertForSSL` is set to `true`. +awsRegion | Conditionally | String | Represents the AWS Region used by ACM or Amazon S3. Required if `useAcmCertForSSL` is set to `true` or `sslKeyCertChainFile` and `sslKeyFile` is the Amazon S3 path. +authentication | No | Object | An authentication configuration. By default, an unauthenticated server is created for the pipeline. This uses pluggable authentication for HTTPS. To use basic authentication, define the `http_basic` plugin with a `username` and `password`. To provide customer authentication, use or create a plugin that implements [GrpcAuthenticationProvider](https://github.com/opensearch-project/data-prepper/blob/1.2.0/data-prepper-plugins/armeria-common/src/main/java/com/amazon/dataprepper/armeria/authentication/GrpcAuthenticationProvider.java). 
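
The configuration reference above does not include a usage example. A minimal sketch, following the same pipeline layout as the other OpenTelemetry sources in this documentation and using the `stdout` sink only for illustration, might look like the following:

```yaml
otel-metrics-pipeline:
  source:
    otel_metrics_source:
      ssl: false
  sink:
    - stdout:
```
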
+ + + +## Metrics + +The `otel_metrics_source` source includes the following metrics. + +### Counters + +- `requestTimeouts`: Measures the total number of requests that time out. +- `requestsReceived`: Measures the total number of requests received by the OpenTelemetry metrics source. + diff --git a/_data-prepper/pipelines/configuration/sources/otel-trace.md b/_data-prepper/pipelines/configuration/sources/otel-trace.md new file mode 100644 index 00000000..4b176477 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/otel-trace.md @@ -0,0 +1,53 @@ +--- +layout: default +title: otel_trace_source source +parent: Sources +grand_parent: Pipelines +nav_order: 15 +--- + + +# otel_trace source + +## Overview + +The `otel_trace` source is a source for the OpenTelemetry Collector. The following table describes options you can use to configure the `otel_trace` source. + + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +port | No | Integer | The port that the `otel_trace` source runs on. Default value is `21890`. +request_timeout | No | Integer | The request timeout, in milliseconds. Default value is `10000`. +health_check_service | No | Boolean | Enables a gRPC health check service under `grpc.health.v1/Health/Check`. Default value is `false`. +unauthenticated_health_check | No | Boolean | Determines whether or not authentication is required on the health check endpoint. Data Prepper ignores this option if no authentication is defined. Default value is `false`. +proto_reflection_service | No | Boolean | Enables a reflection service for Protobuf services (see [gRPC reflection](https://github.com/grpc/grpc/blob/master/doc/server-reflection.md) and [gRPC Server Reflection Tutorial](https://github.com/grpc/grpc-java/blob/master/documentation/server-reflection-tutorial.md) docs). Default value is `false`. +unframed_requests | No | Boolean | Enable requests not framed using the gRPC wire protocol. +thread_count | No | Integer | The number of threads to keep in the ScheduledThreadPool. Default value is `200`. +max_connection_count | No | Integer | The maximum allowed number of open connections. Default value is `500`. +ssl | No | Boolean | Enables connections to the OTel source port over TLS/SSL. Defaults to `true`. +sslKeyCertChainFile | Conditionally | String | File system path or Amazon Simple Storage Service (Amazon S3) path to the security certificate (for example, `"config/demo-data-prepper.crt"` or `"s3://my-secrets-bucket/demo-data-prepper.crt"`). Required if `ssl` is set to `true`. +sslKeyFile | Conditionally | String | File system path or Amazon S3 path to the security key (for example, `"config/demo-data-prepper.key"` or `"s3://my-secrets-bucket/demo-data-prepper.key"`). Required if `ssl` is set to `true`. +useAcmCertForSSL | No | Boolean | Whether to enable TLS/SSL using a certificate and private key from AWS Certificate Manager (ACM). Default value is `false`. +acmCertificateArn | Conditionally | String | Represents the ACM certificate ARN. ACM certificate take preference over S3 or local file system certificate. Required if `useAcmCertForSSL` is set to `true`. +awsRegion | Conditionally | String | Represents the AWS region used by ACM or Amazon S3. Required if `useAcmCertForSSL` is set to `true` or `sslKeyCertChainFile` and `sslKeyFile` are Amazon S3 paths. +authentication | No | Object | An authentication configuration. By default, an unauthenticated server is created for the pipeline. This parameter uses pluggable authentication for HTTPS. 
To use basic authentication, define the `http_basic` plugin with a `username` and `password`. To provide customer authentication, use or create a plugin that implements [GrpcAuthenticationProvider](https://github.com/opensearch-project/data-prepper/blob/1.2.0/data-prepper-plugins/armeria-common/src/main/java/com/amazon/dataprepper/armeria/authentication/GrpcAuthenticationProvider.java). + + +## Metrics + +### Counters + +- `requestTimeouts`: Measures the total number of requests that time out. +- `requestsReceived`: Measures the total number of requests received by the `otel_trace` source. +- `successRequests`: Measures the total number of requests successfully processed by the `otel_trace` source plugin. +- `badRequests`: Measures the total number of requests with an invalid format processed by the `otel_trace` source plugin. +- `requestsTooLarge`: Measures the total number of requests whose number of spans exceeds the buffer capacity. +- `internalServerError`: Measures the total number of requests processed by the `otel_trace` source with a custom exception type. + +### Timers + +- `requestProcessDuration`: Measures the latency of requests processed by the `otel_trace` source plugin in seconds. + +### Distribution summaries + +- `payloadSize`: Measures the incoming request payload size distribution in bytes. \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/sources/s3.md b/_data-prepper/pipelines/configuration/sources/s3.md new file mode 100644 index 00000000..7dc31caa --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/s3.md @@ -0,0 +1,306 @@ +--- +layout: default +title: s3 source +parent: Sources +grand_parent: Pipelines +nav_order: 20 +--- + +# s3 source + +`s3` is a source plugin that reads events from [Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) objects. It requires an [Amazon Simple Queue Service (Amazon SQS)](https://aws.amazon.com/sqs/) queue that receives [S3 Event Notifications](https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html). After Amazon SQS is configured, the `s3` source receives messages from Amazon SQS. When the SQS message indicates that an S3 object was created, the `s3` source loads the S3 objects and then parses them using the configured [codec](#codec). You can also configure the `s3` source to use [Amazon S3 Select](https://docs.aws.amazon.com/AmazonS3/latest/userguide/selecting-content-from-objects.html) instead of Data Prepper to parse S3 objects. + +## IAM permissions + +In order to use the `s3` source, configure your AWS Identity and Access Management (IAM) permissions to grant Data Prepper access to Amazon S3. You can use a configuration similar to the following JSON configuration: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "s3-access", + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:ListBucket", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::/*" + }, + { + "Sid": "sqs-access", + "Effect": "Allow", + "Action": [ + "sqs:ChangeMessageVisibility", + "sqs:DeleteMessage", + "sqs:ReceiveMessage" + ], + "Resource": "arn:aws:sqs::<123456789012>:" + }, + { + "Sid": "kms-access", + "Effect": "Allow", + "Action": "kms:Decrypt", + "Resource": "arn:aws:kms::<123456789012>:key/" + } + ] +} +``` + +If your S3 objects or Amazon SQS queues do not use [AWS Key Management Service (AWS KMS)](https://aws.amazon.com/kms/), remove the `kms:Decrypt` permission. 
+ +If you do not enable `visibility_duplication_protection`, you can remove the `sqs:ChangeMessageVisibility` permission from the SQS queue's access. + +## Cross-account S3 access + +When Data Prepper fetches data from an S3 bucket, it verifies the ownership of the bucket using the +[bucket owner condition](https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucket-owner-condition.html). +By default, Data Prepper expects an S3 bucket to be owned by the same that owns the correlating SQS queue. +When no SQS is provided, Data Prepper uses the Amazon Resource Name (ARN) role in the `aws` configuration. + +If you plan to ingest data from multiple S3 buckets but each bucket is associated with a different S3 account, you need to configure Data Prepper to check for cross-account S3 access, according to the following conditions: + +- If all S3 buckets you want data from belong to an account other than that of the SQS queue, set `default_bucket_owner` to the account ID of the bucket account holder. +- If your S3 buckets are in multiple accounts, use a `bucket_owners` map. + +In the following example, the SQS queue is owned by account `000000000000`. The SQS queue contains data from two S3 buckets: `my-bucket-01` and `my-bucket-02`. +Because `my-bucket-01` is owned by `123456789012` and `my-bucket-02` is owned by `999999999999`, the `bucket_owners` map calls both bucket owners with their account IDs, as shown in the following configuration: + +``` +s3: + sqs: + queue_url: "https://sqs.us-east-1.amazonaws.com/000000000000/MyQueue" + bucket_owners: + my-bucket-01: 123456789012 + my-bucket-02: 999999999999 +``` + +You can use both `bucket_owners` and `default_bucket_owner` together. + +## Configuration + +You can use the following options to configure the `s3` source. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`notification_type` | Yes | String | Must be `sqs`. +`notification_source` | No | String | Determines how notifications are received by SQS. Must be `s3` or `eventbridge`. `s3` represents notifications that are directly sent from Amazon S3 to Amazon SQS or fanout notifications from Amazon S3 to Amazon Simple Notification Service (Amazon SNS) to Amazon SQS. `eventbridge` represents notifications from [Amazon EventBridge](https://aws.amazon.com/eventbridge/) and [Amazon Security Lake](https://aws.amazon.com/security-lake/). Default is `s3`. +`compression` | No | String | The compression algorithm to apply: `none`, `gzip`, or `automatic`. Default is `none`. +`codec` | Yes | Codec | The [codec](#codec) to apply. +`sqs` | Yes | SQS | The SQS configuration. See [sqs](#sqs) for more information. +`aws` | Yes | AWS | The AWS configuration. See [aws](#aws) for more information. +`on_error` | No | String | Determines how to handle errors in Amazon SQS. Can be either `retain_messages` or `delete_messages`. `retain_messages` leaves the message in the Amazon SQS queue and tries to send the message again. This is recommended for dead-letter queues. `delete_messages` deletes failed messages. Default is `retain_messages`. +buffer_timeout | No | Duration | The amount of time allowed for writing events to the Data Prepper buffer before timeout occurs. Any events that the Amazon S3 source cannot write to the buffer during the set amount of time are discarded. Default is `10s`. +`records_to_accumulate` | No | Integer | The number of messages that accumulate before being written to the buffer. Default is `100`. 
+`metadata_root_key` | No | String | The base key for adding S3 metadata to each event. The metadata includes the key and bucket for each S3 object. Default is `s3/`. +`disable_bucket_ownership_validation` | No | Boolean | When `true`, the S3 source does not attempt to validate that the bucket is owned by the expected account. The expected account is the same account that owns the Amazon SQS queue. Default is `false`. +`acknowledgments` | No | Boolean | When `true`, enables `s3` sources to receive [end-to-end acknowledgments]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines#end-to-end-acknowledgments) when events are received by OpenSearch sinks. +`s3_select` | No | [s3_select](#s3_select) | The Amazon S3 Select configuration. +`scan` | No | [scan](#scan) | The S3 scan configuration. +`delete_s3_objects_on_read` | No | Boolean | When `true`, the S3 scan attempts to delete S3 objects after all events from the S3 object are successfully acknowledged by all sinks. `acknowledgments` should be enabled when deleting S3 objects. Default is `false`. + + +## sqs + +The following parameters allow you to configure usage for Amazon SQS in the `s3` source plugin. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`queue_url` | Yes | String | The URL of the Amazon SQS queue from which messages are received. +`maximum_messages` | No | Integer | The maximum number of messages to receive from the Amazon SQS queue in any single request. Default is `10`. +`visibility_timeout` | No | Duration | The visibility timeout to apply to messages read from the Amazon SQS queue. This should be set to the amount of time that Data Prepper may take to read all the S3 objects in a batch. Default is `30s`. +`wait_time` | No | Duration | The amount of time to wait for long polling on the Amazon SQS API. Default is `20s`. +`poll_delay` | No | Duration | A delay placed between the reading and processing of a batch of Amazon SQS messages and making a subsequent request. Default is `0s`. +`visibility_duplication_protection` | No | Boolean | If set to `true`, Data Prepper attempts to avoid duplicate processing by extending the visibility timeout of SQS messages. Until the data reaches the sink, Data Prepper will regularly call `ChangeMessageVisibility` to avoid reading the S3 object again. To use this feature, you need to grant permissions to `ChangeMessageVisibility` on the IAM role. Default is `false`. +`visibility_duplicate_protection_timeout` | No | Duration | Sets the maximum total length of time that a message will not be processed when using `visibility_duplication_protection`. Defaults to two hours. + + +## aws + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`region` | No | String | The AWS Region to use for credentials. Defaults to [standard SDK behavior to determine the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +`sts_role_arn` | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon SQS and Amazon S3. Defaults to `null`, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). +`aws_sts_header_overrides` | No | Map | A map of header overrides that the IAM role assumes for the sink plugin. + +## codec + +The `codec` determines how the `s3` source parses each Amazon S3 object. 
For increased and more efficient performance, you can use [codec combinations]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/codec-processor-combinations/) with certain processors. + +### `newline` codec + +The `newline` codec parses each single line as a single log event. This is ideal for most application logs because each event parses per single line. It can also be suitable for S3 objects that have individual JSON objects on each line, which matches well when used with the [parse_json]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/parse-json/) processor to parse each line. + +Use the following options to configure the `newline` codec. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`skip_lines` | No | Integer | The number of lines to skip before creating events. You can use this configuration to skip common header rows. Default is `0`. +`header_destination` | No | String | A key value to assign to the header line of the S3 object. If this option is specified, then each event will contain a `header_destination` field. + +### json codec + +The `json` codec parses each S3 object as a single JSON object from a JSON array and then creates a Data Prepper log event for each object in the array. + +### csv codec + +The `csv` codec parses objects in comma-separated value (CSV) format, with each row producing a Data Prepper log event. Use the following options to configure the `csv` codec. + +Option | Required | Type | Description +:--- |:---------|:------------| :--- +`delimiter` | Yes | Integer | The delimiter separating columns. Default is `,`. +`quote_character` | Yes | String | The character used as a text qualifier for CSV data. Default is `"`. +`header` | No | String list | The header containing the column names used to parse CSV data. +`detect_header` | No | Boolean | Whether the first line of the Amazon S3 object should be interpreted as a header. Default is `true`. + + + + +## Using `s3_select` with the `s3` source + +When configuring `s3_select` to parse Amazon S3 objects, use the following options: + +Option | Required | Type | Description +:--- |:-----------------------|:------------| :--- +`expression` | Yes, when using `s3_select` | String | The expression used to query the object. Maps directly to the [expression](https://docs.aws.amazon.com/AmazonS3/latest/API/API_SelectObjectContent.html#AmazonS3-SelectObjectContent-request-Expression) property. +`expression_type` | No | String | The type of the provided expression. Default value is `SQL`. Maps directly to the [ExpressionType](https://docs.aws.amazon.com/AmazonS3/latest/API/API_SelectObjectContent.html#AmazonS3-SelectObjectContent-request-ExpressionType). +`input_serialization` | Yes, when using `s3_select` | String | Provides the S3 Select file format. Amazon S3 uses this format to parse object data into records and returns only records that match the specified SQL expression. May be `csv`, `json`, or `parquet`. +`compression_type` | No | String | Specifies an object's compression format. Maps directly to the [CompressionType](https://docs.aws.amazon.com/AmazonS3/latest/API/API_InputSerialization.html#AmazonS3-Type-InputSerialization-CompressionType). +`csv` | No | [csv](#s3_select_csv) | Provides the CSV configuration for processing CSV data. +`json` | No | [json](#s3_select_json) | Provides the JSON configuration for processing JSON data. 
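
As a sketch of how these options fit together, the following hypothetical configuration uses S3 Select to parse CSV objects instead of a codec; the queue URL, Region, role, and SQL expression are placeholders:

```yaml
source:
  s3:
    notification_type: sqs
    sqs:
      queue_url: "https://sqs.us-east-1.amazonaws.com/123456789012/MyQueue"
    s3_select:
      expression: "select * from s3object s"
      input_serialization: csv
    aws:
      region: "us-east-1"
      sts_role_arn: "arn:aws:iam::123456789012:role/Data-Prepper"
```
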
+ +### csv + +Use the following options in conjunction with the `csv` configuration for `s3_select` to determine how your parsed CSV file should be formatted. + +These options map directly to options available in the S3 Select [CSVInput](https://docs.aws.amazon.com/AmazonS3/latest/API/API_CSVInput.html) data type. + +Option | Required | Type | Description +:--- |:---------|:------------| :--- +`file_header_info` | No | String | Describes the first line of input. Maps directly to the [FileHeaderInfo](https://docs.aws.amazon.com/AmazonS3/latest/API/API_CSVInput.html#AmazonS3-Type-CSVInput-FileHeaderInfo) property. +`quote_escape` | No | String | A single character used for escaping the quotation mark character inside an already escaped value. Maps directly to the [QuoteEscapeCharacter](https://docs.aws.amazon.com/AmazonS3/latest/API/API_CSVInput.html#AmazonS3-Type-CSVInput-QuoteEscapeCharacter) property. +`comments` | No | String | A single character used to indicate that a row should be ignored when the character is present at the start of that row. Maps directly to the [Comments](https://docs.aws.amazon.com/AmazonS3/latest/API/API_CSVInput.html#AmazonS3-Type-CSVInput-Comments) property. + +#### json + +Use the following option in conjunction with `json` for `s3_select` to determine how S3 Select processes the JSON file. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`type` | No | String | The type of JSON array. May be either `DOCUMENT` or `LINES`. Maps directly to the [Type](https://docs.aws.amazon.com/AmazonS3/latest/API/API_JSONInput.html#AmazonS3-Type-JSONInput-Type) property. + +## Using `scan` with the `s3` source +The following parameters allow you to scan S3 objects. All options can be configured at the bucket level. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`start_time` | No | String | The time from which to start scanning objects modified after the given `start_time`. This should follow [ISO LocalDateTime](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE_TIME) format, for example, `023-01-23T10:00:00`. If `end_time` is configured along with `start_time`, all objects after `start_time` and before `end_time` will be processed. `start_time` and `range` cannot be used together. +`end_time` | No | String | The time after which no objects will be scanned after the given `end_time`. This should follow [ISO LocalDateTime](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE_TIME) format, for example, `023-01-23T10:00:00`. If `start_time` is configured along with `end_time`, all objects after `start_time` and before `end_time` will be processed. `end_time` and `range` cannot be used together. +`range` | No | String | The time range from which objects are scanned from all buckets. Supports ISO_8601 notation strings, such as `PT20.345S` or `PT15M`, and notation strings for seconds (`60s`) and milliseconds (`1600ms`). `start_time` and `end_time` cannot be used with `range`. Range `P12H` scans all the objects modified in the last 12 hours from the time pipeline started. +`buckets` | Yes | List | A list of [buckets](#bucket) to scan. +`scheduling` | No | List | The configuration for scheduling periodic scans on all buckets. `start_time`, `end_time` and `range` can not be used if scheduling is configured. + +### bucket + +Option | Required | Type | Description +:--- | :--- |:-----| :--- +`bucket` | Yes | Map | Provides options for each bucket. 
+ +You can configure the following options inside the [bucket](#bucket) setting. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`name` | Yes | String | The string representing the S3 bucket name to scan. +`filter` | No | [Filter](#filter) | Provides the filter configuration. +`start_time` | No | String | The time from which to start scanning objects modified after the given `start_time`. This should follow [ISO LocalDateTime](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE_TIME) format, for example, `023-01-23T10:00:00`. If `end_time` is configured along with `start_time`, all objects after `start_time` and before `end_time` will be processed. `start_time` and `range` cannot be used together. This will overwrites the `start_time` at the [scan](#scan) level. +`end_time` | No | String | The time after which no objects will be scanned after the given `end_time`. This should follow [ISO LocalDateTime](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE_TIME) format, for example, `023-01-23T10:00:00`. If `start_time` is configured along with `end_time`, all objects after `start_time` and before `end_time` will be processed. This overwrites the `end_time` at the [scan](#scan) level. +`range` | No | String | The time range from which objects are scanned from all buckets. Supports ISO_8601 notation strings, such as `PT20.345S` or `PT15M`, and notation strings for seconds (`60s`) and milliseconds (`1600ms`). `start_time` and `end_time` cannot be used with `range`. Range `P12H` scans all the objects modified in the last 12 hours from the time pipeline started. This overwrites the `range` at the [scan](#scan) level. + +### filter + +Use the following options inside the `filter` configuration. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`include_prefix` | No | List | A list of S3 key prefix strings included in the scan. By default, all the objects in a bucket are included. +`exclude_suffix` | No | List | A list of S3 key suffix strings excluded from the scan. By default, no objects in a bucket are excluded. + +### scheduling + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`interval` | Yes | String | Indicates the minimum interval between each scan. The next scan in the interval will start after the interval duration from the last scan ends and when all the objects from the previous scan are processed. Supports ISO 8601 notation strings, such as `PT20.345S` or `PT15M`, and notation strings for seconds (`60s`) and milliseconds (`1600ms`). +`count` | No | Integer | Specifies how many times a bucket will be scanned. Defaults to `Integer.MAX_VALUE`. + + +## Metrics + +The `s3` source includes the following metrics: + +### Counters + +* `s3ObjectsFailed`: The number of S3 objects that the `s3` source failed to read. +* `s3ObjectsNotFound`: The number of S3 objects that the `s3` source failed to read due to an S3 "Not Found" error. These are also counted toward `s3ObjectsFailed`. +* `s3ObjectsAccessDenied`: The number of S3 objects that the `s3` source failed to read due to an "Access Denied" or "Forbidden" error. These are also counted toward `s3ObjectsFailed`. +* `s3ObjectsSucceeded`: The number of S3 objects that the `s3` source successfully read. +* `sqsMessagesReceived`: The number of Amazon SQS messages received from the queue by the `s3` source. 
+* `sqsMessagesDeleted`: The number of Amazon SQS messages deleted from the queue by the `s3` source. +* `sqsMessagesFailed`: The number of Amazon SQS messages that the `s3` source failed to parse. +* `s3ObjectNoRecordsFound` -- The number of S3 objects that resulted in 0 records added to the buffer by the `s3` source. +* `sqsMessagesDeleteFailed` -- The number of SQS messages that the `s3` source failed to delete from the SQS queue. +* `s3ObjectsDeleted` -- The number of S3 objects deleted by the `s3` source. +* `s3ObjectsDeleteFailed` -- The number of S3 objects that the `s3` source failed to delete. + +### Timers + +* `s3ObjectReadTimeElapsed`: Measures the amount of time the `s3` source takes to perform a request to GET an S3 object, parse it, and write events to the buffer. +* `sqsMessageDelay`: Measures the time elapsed from when S3 creates an object to when it is fully parsed. + +### Distribution summaries + +* `s3ObjectSizeBytes`: Measures the size of S3 objects as reported by the S3 `Content-Length`. For compressed objects, this is the compressed size. +* `s3ObjectProcessedBytes`: Measures the bytes processed by the `s3` source for a given object. For compressed objects, this is the uncompressed size. +* `s3ObjectsEvents`: Measures the number of events (sometimes called records) produced by an S3 object. + +## Example: Uncompressed logs with sqs + +The following pipeline.yaml file shows the minimum configuration for reading uncompressed newline-delimited logs: + +``` +source: + s3: + notification_type: sqs + codec: + newline: + compression: none + sqs: + queue_url: "https://sqs.us-east-1.amazonaws.com/123456789012/MyQueue" + aws: + region: "us-east-1" + sts_role_arn: "arn:aws:iam::123456789012:role/Data-Prepper" +``` + +## Example: Uncompressed logs with scan + +The following pipeline.yaml file shows the minimum configuration for scanning objects with uncompressed newline-delimited logs: + +``` +source: + s3: + codec: + newline: + compression: none + aws: + region: "us-east-1" + sts_role_arn: "arn:aws:iam::123456789012:role/Data-Prepper" + scan: + start_time: 2023-01-01T00:00:00 + range: "P365D" + buckets: + - bucket: + name: "s3-scan-test" + filter: + exclude_suffix: + - "*.log" +``` diff --git a/_data-prepper/pipelines/configuration/sources/sources.md b/_data-prepper/pipelines/configuration/sources/sources.md new file mode 100644 index 00000000..b684db56 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/sources.md @@ -0,0 +1,11 @@ +--- +layout: default +title: Sources +parent: Pipelines +has_children: true +nav_order: 15 +--- + +# Sources + +Sources define where your data comes from within a Data Prepper pipeline. diff --git a/_data-prepper/pipelines/dlq.md b/_data-prepper/pipelines/dlq.md new file mode 100644 index 00000000..3032536e --- /dev/null +++ b/_data-prepper/pipelines/dlq.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Dead-letter queues +parent: Pipelines +nav_order: 13 +--- + +# Dead-letter queues + +Data Prepper pipelines support dead-letter queues (DLQs) for offloading failed events and making them accessible for analysis. + +As of Data Prepper 2.3, only the `s3` source supports DLQs. 
+ +## Configure a DLQ writer + +To configure a DLQ writer for the `s3` source, add the following to your pipeline.yaml file: + +```yaml + sink: + opensearch: + dlq: + s3: + bucket: "my-dlq-bucket" + key_path_prefix: "dlq-files/" + region: "us-west-2" + sts_role_arn: "arn:aws:iam::123456789012:role/dlq-role" +``` + +The resulting DLQ file outputs as a JSON array of DLQ objects. Any file written to the S3 DLQ contains the following name pattern: + +``` +dlq-v${version}-${pipelineName}-${pluginId}-${timestampIso8601}-${uniqueId} +``` +The following information is replaced in the name pattern: + + +- `version`: The Data Prepper version. +- `pipelineName`: The pipeline name indicated in pipeline.yaml. +- `pluginId`: The ID of the plugin associated with the DLQ event. + +## Configuration + +DLQ supports the following configuration options. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +bucket | Yes | String | The name of the bucket into which the DLQ outputs failed records. +key_path_prefix | No | String | The `key_prefix` used in the S3 bucket. Defaults to `""`. Supports time value pattern variables, such as `/%{yyyy}/%{MM}/%{dd}`, including any variables listed in the [Java DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). For example, when using the `/%{yyyy}/%{MM}/%{dd}` pattern, you can set `key_prefix` as `/2023/01/24`. +region | No | String | The AWS Region of the S3 bucket. Defaults to `us-east-1`. +sts_role_arn | No | String | The STS role the DLQ assumes in order to write to an AWS S3 bucket. Default is `null`, which uses the standard SDK behavior for credentials. To use this option, the S3 bucket must have the `S3:PutObject` permission configured. + +When using DLQ with an OpenSearch sink, you can configure the [max_retries]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sinks/opensearch/#configure-max_retries) option to send failed data to the DLQ when the sink reaches the maximum number of retries. + + +## Metrics + +DLQ supports the following metrics. + +### Counter + +- `dlqS3RecordsSuccess`: Measures the number of successful records sent to S3. +- `dlqS3RecordsFailed`: Measures the number of records that failed to be sent to S3. +- `dlqS3RequestSuccess`: Measures the number of successful S3 requests. +- `dlqS3RequestFailed`: Measures the number of failed S3 requests. + +### Distribution summary + +- `dlqS3RequestSizeBytes`: Measures the distribution of the S3 request's payload size in bytes. + +### Timer + +- `dlqS3RequestLatency`: Measures latency when sending each S3 request, including retries. + +## DLQ objects + +DLQ supports the following DLQ objects: + +* `pluginId`: The ID of the plugin that originated the event sent to the DLQ. +* `pluginName`: The name of the plugin. +* `failedData` : An object that contains the failed object and its options. This object is unique to each plugin. +* `pipelineName`: The name of the Data Prepper pipeline in which the event failed. +* `timestamp`: The timestamp of the failures in an `ISO8601` format. + diff --git a/_data-prepper/pipelines/expression-syntax.md b/_data-prepper/pipelines/expression-syntax.md new file mode 100644 index 00000000..8257ab89 --- /dev/null +++ b/_data-prepper/pipelines/expression-syntax.md @@ -0,0 +1,247 @@ +--- +layout: default +title: Expression syntax +parent: Pipelines +nav_order: 12 +--- + +# Expression syntax + +The following sections provide information about expression syntax in Data Prepper. 
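
Expressions are most commonly used as conditions, for example in pipeline routing or in processor options such as the `drop_events` processor's `drop_when` setting. The following hypothetical route definitions show where expression strings appear in a `pipeline.yaml` file:

```yaml
route:
  - errors: '/status_code >= 400'
  - success: '/status_code >= 200 and /status_code < 300'
```
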
+
+## Supported operators
+
+Operators are listed in order of precedence (top to bottom, left to right).
+
+| Operator | Description | Associativity |
+|----------------------|-------------------------------------------------------|---------------|
+| `()` | Priority Expression | left-to-right |
+| `not`<br>`+`<br>`-` | Unary Logical NOT<br>Unary Positive<br>Unary Negative | right-to-left |
+| `<`, `<=`, `>`, `>=` | Relational Operators | left-to-right |
+| `==`, `!=` | Equality Operators | left-to-right |
+| `and`, `or` | Conditional Expression | left-to-right |
+
+## Reserved for possible future functionality
+
+Reserved symbol set: `^`, `*`, `/`, `%`, `+`, `-`, `xor`, `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `++`, `--`, `${}`
+
+## Set initializer
+
+The set initializer defines a set of terms and/or expressions.
+
+### Examples
+
+The following are examples of set initializer syntax.
+
+#### HTTP status codes
+
+```
+{200, 201, 202}
+```
+
+#### HTTP response payloads
+
+```
+{"Created", "Accepted"}
+```
+
+#### Handle multiple event types with different keys
+
+```
+{/request_payload, /request_message}
+```
+
+## Priority expression
+
+A priority expression identifies an expression that will be evaluated at the highest priority level. A priority expression must contain an expression or value; empty parentheses are not supported.
+
+### Example
+
+```
+/is_cool == (/name == "Steven")
+```
+
+## Relational operators
+
+Relational operators are used to test the relationship of two numeric values. The operands must be numbers or JSON Pointers that resolve to numbers.
+
+### Syntax
+
+```
+ <
+ <=
+ >
+ >=
+```
+
+### Example
+
+```
+/status_code >= 200 and /status_code < 300
+```
+
+## Equality operators
+
+Equality operators are used to test whether two values are equivalent.
+
+### Syntax
+
+```
+ ==
+ !=
+```
+
+### Examples
+
+```
+/is_cool == true
+3.14 != /status_code
+{1, 2} == /event/set_property
+```
+
+## Using equality operators to check for a JSON Pointer
+
+Equality operators can also be used to check whether a JSON Pointer exists by comparing its value with `null`.
+
+### Syntax
+
+```
+ == null
+ != null
+null ==
+null !=
+```
+
+### Example
+
+```
+/response == null
+null != /response
+```
+
+## Conditional expression
+
+A conditional expression is used to chain together multiple expressions and/or values.
+
+### Syntax
+
+```
+ and
+ or
+not
+```
+
+### Example
+
+```
+/status_code == 200 and /message == "Hello world"
+/status_code == 200 or /status_code == 202
+not /status_code in {200, 202}
+/response == null
+/response != null
+```
+
+## Definitions
+
+This section provides expression definitions.
+
+### Literal
+
+A literal is a fundamental value that has no children:
+
+- Float: Supports values from 3.40282347 × 10^38 to 1.40239846 × 10^−45.
+- Integer: Supports values from −2,147,483,648 to 2,147,483,647.
+- Boolean: Supports `true` or `false`.
+- JSON Pointer: See the [JSON Pointer](#json-pointer) section for details.
+- String: Supports valid Java strings.
+- Null: Supports a null check to determine whether a JSON Pointer exists.
+
+### Expression string
+
+An expression string takes the highest priority in a Data Prepper expression and supports only one expression string, which results in a return value. An _expression string_ is not the same as an _expression_.
+
+### Statement
+
+A statement is the highest-priority component of an expression string.
+
+### Expression
+
+An expression is a generic component that contains a _Primary_ or an _Operator_. Expressions may contain expressions. An expression's immediate children can contain 0–1 _Operators_.
+
+### Primary
+
+- _Set_
+- _Priority Expression_
+- _Literal_
+
+### Operator
+
+An operator is a hardcoded token that identifies the operation used in an _expression_.
+
+### JSON Pointer
+
+A JSON Pointer is a literal used to reference a value within an event and provided as context for an _expression string_.
JSON Pointers are identified by a leading `/` containing alphanumeric characters or underscores, delimited by `/`. JSON Pointers can use an extended character set if wrapped in double quotes (`"`) using the escape character `\`. Note that JSON Pointers require `~` and `/` characters, which should be used as part of the path and not as a delimiter that needs to be escaped. + +The following are examples of JSON Pointers: + +- `~0` representing `~` +- `~1` representing `/` + +#### Shorthand syntax (Regex, `\w` = `[A-Za-z_]`) +``` +/\w+(/\w+)* +``` + +#### Example of shorthand + +The following is an example of shorthand: + +``` +/Hello/World/0 +``` + +#### Example of escaped syntax + +The following is an example of escaped syntax: +``` +"/(/)*" +``` + +#### Example of an escaped JSON Pointer + +The following is an example of an escaped JSON Pointer: +``` +# Path +# { "Hello - 'world/" : [{ "\"JsonPointer\"": true }] } +"/Hello - 'world\//0/\"JsonPointer\"" +``` + +## White space + +White space is **optional** surrounding relational operators, regex equality operators, equality operators, and commas. +White space is **required** surrounding set initializers, priority expressions, set operators, and conditional expressions. + + +| Operator | Description | White space required | ✅ Valid examples | ❌ Invalid examples | +|----------------------|--------------------------|----------------------|----------------------------------------------------------------|---------------------------------------| +| `{}` | Set initializer | Yes | `/status in {200}` | `/status in{200}` | +| `()` | Priority expression | Yes | `/a==(/b==200)`
`/a in ({200})` | `/status in({200})` | +| `in`, `not in` | Set operators | Yes | `/a in {200}`
`/a not in {400}` | `/a in{200, 202}`
`/a not in{400}` | +| `<`, `<=`, `>`, `>=` | Relational operators | No | `/status < 300`
`/status>=300` | | +| `=~`, `!~` | Regex equality operators | No | `/msg =~ "^\w*$"`
`/msg=~"^\w*$"` | | +| `==`, `!=` | Equality operators | No | `/status == 200`
`/status_code==200` | | +| `and`, `or`, `not` | Conditional operators | Yes | `/a<300 and /b>200` | `/b<300and/b>200` | +| `,` | Set value delimiter | No | `/a in {200, 202}`
`/a in {200,202}`
`/a in {200 , 202}` | `/a in {200,}` | + + +## Functions + +Data Prepper supports the following built-in functions that can be used in an expression. + +### `length()` + +The `length()` function takes one argument of the JSON pointer type and returns the length of the value passed. For example, `length(/message)` returns a length of `10` when a key message exists in the event and has a value of `1234567890`. + +### `hasTags()` + +The `hastags()` function takes one or more string type arguments and returns `true` if all the arguments passed are present in an event's tags. When an argument does not exist in the event's tags, the function returns `false`. For example, if you use the expression `hasTags("tag1")` and the event contains `tag1`, Data Prepper returns `true`. If you use the expression `hasTags("tag2")` but the event only contains a `tag1` tag, Data Prepper returns `false`. + +### `getMetadata()` + +The `getMetadata()` function takes one literal string argument to look up specific keys in a an event's metadata. If the key contains a `/`, then the function looks up the metadata recursively. When passed, the expression returns the value corresponding to the key. The value returned can be of any type. For example, if the metadata contains `{"key1": "value2", "key2": 10}`, then the function, `getMetadata("key1")`, returns `value2`. The function, `getMetadata("key2")`, returns 10. + +### `contains()` + +The `contains()` function takes two string arguments and determines whether either a literal string or a JSON pointer is contained within an event. When the second argument contains a substring of the first argument, such as `contains("abcde", "abcd")`, the function returns `true`. If the second argument does not contain any substrings, such as `contains("abcde", "xyz")`, it returns `false`. + +### `cidrContains()` + +The `cidrContains()` function takes two or more arguments. The first argument is a JSON pointer, which represents the key to the IP address that is checked. It supports both IPv4 and IPv6 addresses. Every argument that comes after the key is a string type that represents CIDR blocks that are checked against. + +If the IP address in the first argument is in the range of any of the given CIDR blocks, the function returns `true`. If the IP address is not in the range of the CIDR blocks, the function returns `false`. For example, `cidrContains(/sourceIp,"192.0.2.0/24","10.0.1.0/16")` will return `true` if the `sourceIp` field indicated in the JSON pointer has a value of `192.0.2.5`. diff --git a/_data-prepper/pipelines/pipelines-configuration-options.md b/_data-prepper/pipelines/pipelines-configuration-options.md new file mode 100644 index 00000000..5667906a --- /dev/null +++ b/_data-prepper/pipelines/pipelines-configuration-options.md @@ -0,0 +1,18 @@ +--- +layout: default +title: Pipeline options +parent: Pipelines +nav_order: 11 +--- + +# Pipeline options + +This page provides information about pipeline configuration options in Data Prepper. + +## General pipeline options + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +workers | No | Integer | Essentially the number of application threads. As a starting point for your use case, try setting this value to the number of CPU cores on the machine. Default is 1. +delay | No | Integer | Amount of time in milliseconds workers wait between buffer read attempts. Default is `3000`. 
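For example, a minimal pipeline might set both options as follows; this is only an illustrative sketch, and the pipeline name and option values are arbitrary placeholders rather than recommendations:

```yml
illustrative-pipeline:
  workers: 4    # number of application threads; the machine's CPU core count is a reasonable starting point
  delay: 500    # milliseconds each worker waits between buffer read attempts
  source:
    random:
  sink:
    - stdout:
```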
+ diff --git a/_data-prepper/pipelines/pipelines.md b/_data-prepper/pipelines/pipelines.md new file mode 100644 index 00000000..e897ed55 --- /dev/null +++ b/_data-prepper/pipelines/pipelines.md @@ -0,0 +1,352 @@ +--- +layout: default +title: Pipelines +has_children: true +nav_order: 10 +redirect_from: + - /data-prepper/pipelines/ + - /clients/data-prepper/pipelines/ +--- + +# Pipelines + +The following image illustrates how a pipeline works. + +Data Prepper pipeline{: .img-fluid} + +To use Data Prepper, you define pipelines in a configuration YAML file. Each pipeline is a combination of a source, a buffer, zero or more processors, and one or more sinks. For example: + +```yml +simple-sample-pipeline: + workers: 2 # the number of workers + delay: 5000 # in milliseconds, how long workers wait between read attempts + source: + random: + buffer: + bounded_blocking: + buffer_size: 1024 # max number of records the buffer accepts + batch_size: 256 # max number of records the buffer drains after each read + processor: + - string_converter: + upper_case: true + sink: + - stdout: +``` + +- Sources define where your data comes from. In this case, the source is a random UUID generator (`random`). + +- Buffers store data as it passes through the pipeline. + + By default, Data Prepper uses its one and only buffer, the `bounded_blocking` buffer, so you can omit this section unless you developed a custom buffer or need to tune the buffer settings. + +- Processors perform some action on your data: filter, transform, enrich, etc. + + You can have multiple processors, which run sequentially from top to bottom, not in parallel. The `string_converter` processor transform the strings by making them uppercase. + +- Sinks define where your data goes. In this case, the sink is stdout. + +Starting from Data Prepper 2.0, you can define pipelines across multiple configuration YAML files, where each file contains the configuration for one or more pipelines. This gives you more freedom to organize and chain complex pipeline configurations. For Data Prepper to load your pipeline configuration properly, place your configuration YAML files in the `pipelines` folder under your application's home directory (e.g. `/usr/share/data-prepper`). +{: .note } + +## End-to-end acknowledgments + +Data Prepper ensures the durability and reliability of data written from sources and delivered to sinks through end-to-end (E2E) acknowledgments. An E2E acknowledgment begins at the source, which monitors a batch of events set inside pipelines and waits for a positive acknowledgment when those events are successfully pushed to sinks. When a pipeline contains multiple sinks, including sinks set as additional Data Prepper pipelines, the E2E acknowledgment sends when events are received by the final sink in a pipeline chain. + +Alternatively, the source sends a negative acknowledgment when an event cannot be delivered to a sink for any reason. + +When any component of a pipeline fails and is unable to send an event, the source receives no acknowledgment. In the case of a failure, the pipeline's source times out. This gives you the ability to take any necessary actions to address the source failure, including rerunning the pipeline or logging the failure. + + +## Conditional routing + +Pipelines also support **conditional routing** which allows you to route events to different sinks based on specific conditions. 
To add conditional routing to a pipeline, specify a list of named routes under the `route` component and add specific routes to sinks under the `routes` property. Any sink with the `routes` property will only accept events that match at least one of the routing conditions. + +In the following example, `application-logs` is a named route with a condition set to `/log_type == "application"`. The route uses [Data Prepper expressions](https://github.com/opensearch-project/data-prepper/tree/main/examples) to define the conditions. Data Prepper only routes events that satisfy the condition to the first OpenSearch sink. By default, Data Prepper routes all events to a sink which does not define a route. In the example, all events route into the third OpenSearch sink. + +```yml +conditional-routing-sample-pipeline: + source: + http: + processor: + route: + - application-logs: '/log_type == "application"' + - http-logs: '/log_type == "apache"' + sink: + - opensearch: + hosts: [ "https://opensearch:9200" ] + index: application_logs + routes: [application-logs] + - opensearch: + hosts: [ "https://opensearch:9200" ] + index: http_logs + routes: [http-logs] + - opensearch: + hosts: [ "https://opensearch:9200" ] + index: all_logs +``` + + +## Examples + +This section provides some pipeline examples that you can use to start creating your own pipelines. For more pipeline configurations, select from the following options for each component: + +- [Buffers]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/buffers/buffers/) +- [Processors]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/processors/) +- [Sinks]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sinks/sinks/) +- [Sources]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/sources/) + +The Data Prepper repository has several [sample applications](https://github.com/opensearch-project/data-prepper/tree/main/examples) to help you get started. + +### Log ingestion pipeline + +The following example `pipeline.yaml` file with SSL and basic authentication enabled for the `http-source` demonstrates how to use the HTTP Source and Grok Prepper plugins to process unstructured log data: + + +```yaml +log-pipeline: + source: + http: + ssl_certificate_file: "/full/path/to/certfile.crt" + ssl_key_file: "/full/path/to/keyfile.key" + authentication: + http_basic: + username: "myuser" + password: "mys3cret" + processor: + - grok: + match: + # This will match logs with a "log" key against the COMMONAPACHELOG pattern (ex: { "log": "actual apache log..." } ) + # You should change this to match what your logs look like. See the grok documenation to get started. + log: [ "%{COMMONAPACHELOG}" ] + sink: + - opensearch: + hosts: [ "https://localhost:9200" ] + # Change to your credentials + username: "admin" + password: "admin" + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + #cert: /path/to/cert + # If you are connecting to an Amazon OpenSearch Service domain without + # Fine-Grained Access Control, enable these settings. Comment out the + # username and password above. + #aws_sigv4: true + #aws_region: us-east-1 + # Since we are Grok matching for Apache logs, it makes sense to send them to an OpenSearch index named apache_logs. + # You should change this to correspond with how your OpenSearch indexes are set up. + index: apache_logs +``` + +This example uses weak security. 
We strongly recommend securing all plugins which open external ports in production environments. +{: .note} + +### Trace analytics pipeline + +The following example demonstrates how to build a pipeline that supports the [Trace Analytics OpenSearch Dashboards plugin]({{site.url}}{{site.baseurl}}/observability-plugin/trace/ta-dashboards/). This pipeline takes data from the OpenTelemetry Collector and uses two other pipelines as sinks. These two separate pipelines index trace and the service map documents for the dashboard plugin. + +Starting from Data Prepper 2.0, Data Prepper no longer supports `otel_trace_raw_prepper` processor due to the Data Prepper internal data model evolution. +Instead, users should use `otel_trace_raw`. + +```yml +entry-pipeline: + delay: "100" + source: + otel_trace_source: + ssl: false + buffer: + bounded_blocking: + buffer_size: 10240 + batch_size: 160 + sink: + - pipeline: + name: "raw-pipeline" + - pipeline: + name: "service-map-pipeline" +raw-pipeline: + source: + pipeline: + name: "entry-pipeline" + buffer: + bounded_blocking: + buffer_size: 10240 + batch_size: 160 + processor: + - otel_trace_raw: + sink: + - opensearch: + hosts: ["https://localhost:9200"] + insecure: true + username: admin + password: admin + index_type: trace-analytics-raw +service-map-pipeline: + delay: "100" + source: + pipeline: + name: "entry-pipeline" + buffer: + bounded_blocking: + buffer_size: 10240 + batch_size: 160 + processor: + - service_map_stateful: + sink: + - opensearch: + hosts: ["https://localhost:9200"] + insecure: true + username: admin + password: admin + index_type: trace-analytics-service-map +``` + +To maintain similar ingestion throughput and latency, scale the `buffer_size` and `batch_size` by the estimated maximum batch size in the client request payload. +{: .tip} + +### Metrics pipeline + +Data Prepper supports metrics ingestion using OTel. It currently supports the following metric types: + +* Gauge +* Sum +* Summary +* Histogram + +Other types are not supported. Data Prepper drops all other types, including Exponential Histogram and Summary. Additionally, Data Prepper does not support Scope instrumentation. + +To set up a metrics pipeline: + +```yml +metrics-pipeline: + source: + otel_metrics_source: + processor: + - otel_metrics_raw_processor: + sink: + - opensearch: + hosts: ["https://localhost:9200"] + username: admin + password: admin +``` + +### S3 log ingestion pipeline + +The following example demonstrates how to use the S3Source and Grok Processor plugins to process unstructured log data from [Amazon Simple Storage Service](https://aws.amazon.com/s3/) (Amazon S3). This example uses application load balancer logs. As the application load balancer writes logs to S3, S3 creates notifications in Amazon SQS. Data Prepper monitors those notifications and reads the S3 objects to get the log data and process it. 
+ +```yml +log-pipeline: + source: + s3: + notification_type: "sqs" + compression: "gzip" + codec: + newline: + sqs: + queue_url: "https://sqs.us-east-1.amazonaws.com/12345678910/ApplicationLoadBalancer" + aws: + region: "us-east-1" + sts_role_arn: "arn:aws:iam::12345678910:role/Data-Prepper" + + processor: + - grok: + match: + message: ["%{DATA:type} %{TIMESTAMP_ISO8601:time} %{DATA:elb} %{DATA:client} %{DATA:target} %{BASE10NUM:request_processing_time} %{DATA:target_processing_time} %{BASE10NUM:response_processing_time} %{BASE10NUM:elb_status_code} %{DATA:target_status_code} %{BASE10NUM:received_bytes} %{BASE10NUM:sent_bytes} \"%{DATA:request}\" \"%{DATA:user_agent}\" %{DATA:ssl_cipher} %{DATA:ssl_protocol} %{DATA:target_group_arn} \"%{DATA:trace_id}\" \"%{DATA:domain_name}\" \"%{DATA:chosen_cert_arn}\" %{DATA:matched_rule_priority} %{TIMESTAMP_ISO8601:request_creation_time} \"%{DATA:actions_executed}\" \"%{DATA:redirect_url}\" \"%{DATA:error_reason}\" \"%{DATA:target_list}\" \"%{DATA:target_status_code_list}\" \"%{DATA:classification}\" \"%{DATA:classification_reason}"] + - grok: + match: + request: ["(%{NOTSPACE:http_method})? (%{NOTSPACE:http_uri})? (%{NOTSPACE:http_version})?"] + - grok: + match: + http_uri: ["(%{WORD:protocol})?(://)?(%{IPORHOST:domain})?(:)?(%{INT:http_port})?(%{GREEDYDATA:request_uri})?"] + - date: + from_time_received: true + destination: "@timestamp" + + + sink: + - opensearch: + hosts: [ "https://localhost:9200" ] + username: "admin" + password: "admin" + index: alb_logs +``` + +## Migrating from Logstash + +Data Prepper supports Logstash configuration files for a limited set of plugins. Simply use the logstash config to run Data Prepper. + +```bash +docker run --name data-prepper \ + -v /full/path/to/logstash.conf:/usr/share/data-prepper/pipelines/pipelines.conf \ + opensearchproject/opensearch-data-prepper:latest +``` + +This feature is limited by feature parity of Data Prepper. As of Data Prepper 1.2 release, the following plugins from the Logstash configuration are supported: + +- HTTP Input plugin +- Grok Filter plugin +- Elasticsearch Output plugin +- Amazon Elasticsearch Output plugin + +## Configure the Data Prepper server + +Data Prepper itself provides administrative HTTP endpoints such as `/list` to list pipelines and `/metrics/prometheus` to provide Prometheus-compatible metrics data. The port that has these endpoints has a TLS configuration and is specified by a separate YAML file. By default, these endpoints are secured by Data Prepper docker images. We strongly recommend providing your own configuration file for securing production environments. Here is an example `data-prepper-config.yaml`: + +```yml +ssl: true +keyStoreFilePath: "/usr/share/data-prepper/keystore.jks" +keyStorePassword: "password" +privateKeyPassword: "other_password" +serverPort: 1234 +``` + +To configure the Data Prepper server, run Data Prepper with the additional yaml file. + +```bash +docker run --name data-prepper \ + -v /full/path/to/my-pipelines.yaml:/usr/share/data-prepper/pipelines/my-pipelines.yaml \ + -v /full/path/to/data-prepper-config.yaml:/usr/share/data-prepper/data-prepper-config.yaml \ + opensearchproject/data-prepper:latest +``` + +## Configure peer forwarder + +Data Prepper provides an HTTP service to forward events between Data Prepper nodes for aggregation. This is required for operating Data Prepper in a clustered deployment. Currently, peer forwarding is supported in `aggregate`, `service_map_stateful`, and `otel_trace_raw` processors. 
Peer forwarder groups events based on the identification keys provided by the processors. For `service_map_stateful` and `otel_trace_raw` it's `traceId` by default and can not be configured. For `aggregate` processor, it is configurable using `identification_keys` option. + +Peer forwarder supports peer discovery through one of three options: a static list, a DNS record lookup , or AWS Cloud Map. Peer discovery can be configured using `discovery_mode` option. Peer forwarder also supports SSL for verification and encryption, and mTLS for mutual authentication in a peer forwarding service. + +To configure peer forwarder, add configuration options to `data-prepper-config.yaml` mentioned in the [Configure the Data Prepper server](#configure-the-data-prepper-server) section: + +```yml +peer_forwarder: + discovery_mode: dns + domain_name: "data-prepper-cluster.my-domain.net" + ssl: true + ssl_certificate_file: "" + ssl_key_file: "" + authentication: + mutual_tls: +``` + + +## Pipeline Configurations + +Since Data Prepper 2.5, shared pipeline components can be configured under the reserved section `pipeline_configurations` when all pipelines are defined in a single pipeline configuration YAML file. +Shared pipeline configurations can include certain components within [Extension Plugins]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/configuring-data-prepper/#extension-plugins), as shown in the following example that refers to secrets configurations for an `opensearch` sink: + +```json +pipeline_configurations: + aws: + secrets: + credential-secret-config: + secret_id: + region: + sts_role_arn: +simple-sample-pipeline: + ... + sink: + - opensearch: + hosts: [ {% raw %}"${{aws_secrets:host-secret-config}}"{% endraw %} ] + username: {% raw %}"${{aws_secrets:credential-secret-config:username}}"{% endraw %} + password: {% raw %}"${{aws_secrets:credential-secret-config:password}}"{% endraw %} + index: "test-migration" +``` + +When the same component is defined in both `pipelines.yaml` and `data-prepper-config.yaml`, the definition in the `pipelines.yaml` will overwrite the counterpart in `data-prepper-config.yaml`. For more information on shared pipeline components, see [AWS secrets extension plugin]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/configuring-data-prepper/#aws-secrets-extension-plugin) for details. diff --git a/_data/alert.yml b/_data/alert.yml index ecfc87f2..2d9db036 100644 --- a/_data/alert.yml +++ b/_data/alert.yml @@ -1 +1 @@ -message: "🌡️ [OpenSearch 1.1.0 arrived October 5 with cross-cluster replication, bucket-level alerting, and much, much more. 
Grab it here!](/downloads.html)" +message: false diff --git a/_data/footer.yml b/_data/footer.yml index 33366c47..1b4f21b8 100644 --- a/_data/footer.yml +++ b/_data/footer.yml @@ -4,13 +4,14 @@ columns: links: - title: Code of Conduct - url: 'https://opensearch.org/codeofconduct.html' + url: '/codeofconduct.html' - title: 'OpenSearch 中文论坛' url: 'https://www.ossez.com/tag/opensearch' - title: '官方论坛' url: 'https://discuss.opendistrocommunity.dev/' + - title: '中文文档代码仓库 中文论坛' url: 'https://github.com/cwiki-us-docs/opensearch-docs-cn' @@ -18,38 +19,53 @@ columns: title: '官方 Github' url: 'https://github.com/opensearch-project' - - title: '合作方' - url: '/partners/' + title: 'Slack' + url: '/slack.html' - title: '社区项目' url: '/community_projects' - title: '资源(Resources)' links: - #- - # title: 'Documentation' - # url: 'https://github.com/opensearch/documentation' - + title: About + url: '/about.html' + - + title: Release Schedule + url: '/releases.html' + - + title: Maintenance Policy + url: '/releases.html#maintenance-policy' + - title: FAQ url: '/faq/' - - title: 'Brand Guidelines' - url: '/brand.html' + title: 'Testimonials' + url: '/testimonials/' - - title: 'Trademark Usage Policy' - url: '/trademark-usage.html' + title: 'Trademark and Brand Policy' + url: '/trademark-brand-policy.html' - - title: OpenSearch Disambiguation - url: '/disambiguation.html' + title: 'Privacy' + url: 'https://aws.amazon.com/privacy/' - - title: '联系我们(Connect)' + title: 'Contact Us' links: - # - - # title: 'Twitter' - # url: 'https://twitter.com/opensearch_project' - #- - # title: 'Facebook' - # url: 'http://www.facebook.com/opensearch' - - title: 'E-mail' - url: 'mailto:opensearch@amazon.com' \ No newline at end of file + title: 'Connect' + url: '/connect.html' + - + title: 'Twitter' + url: 'https://twitter.com/OpenSearchProj' + - + title: 'LinkedIn' + url: 'https://www.linkedin.com/company/opensearch-project/' + - + title: 'YouTube' + url: 'https://www.youtube.com/c/OpenSearchProject' + - + title: 'Meetup' + url: 'https://www.meetup.com/pro/opensearchproject/' + - + title: 'Facebook' + url: 'https://www.facebook.com/OpenSearchProject/' diff --git a/_data/top_nav.yml b/_data/top_nav.yml new file mode 100644 index 00000000..b02c0632 --- /dev/null +++ b/_data/top_nav.yml @@ -0,0 +1,133 @@ +items: + - + label: OpenSearchCon + fragments: + - opensearchcon + - sessions + - speakers + - exhibitors + - workshops + - unconference + - opensearchcon2023-cfp + children: + - + label: 2024 - Stay Informed + url: /events/opensearchcon/ + - + label: Sessions + url: /events/opensearchcon/sessions/ + - + label: Speakers + url: /events/opensearchcon/speakers/ + - + label: Exhibitors + url: /events/opensearchcon/exhibitors/ + - + label: Workshops + url: /events/opensearchcon/workshops/ + - + label: Unconference + url: /events/opensearchcon/unconference/ + - + label: CFP is closed + url: /opensearchcon2023-cfp.html + - + label: Download + fragments: + - downloads + - versions + url: /downloads.html + - + label: About + url: /about.html + fragments: + - about + - releases + - roadmap + - faq + children: + - + label: Releases + url: /releases.html + - + label: Roadmap + url: https://github.com/orgs/opensearch-project/projects/1 + - + label: FAQ + url: /faq + - + label: Community + fragments: + - events + - community_projects + - blog + - partners + - slack + children: + - + label: Blog + url: /blog + - + label: Forum + url: https://forum.opensearch.org/ + - + label: Slack + url: /slack.html + - + label: Events + url: /events + - + label: 
Partners + url: /partners + - + label: Projects + url: /community_projects + - + label: Documentation + fragments: + - docs + - opensearch + - data-prepper + - clients + - benchmark + url: /docs/latest/ + children: + - + label: OpenSearch and Dashboards + url: /docs/latest/about/ + - + label: Data Prepper + url: /docs/latest/data-prepper/ + - + label: Clients + url: /docs/latest/clients/ + - + label: Benchmark + url: /docs/latest/benchmark/ + - + label: Platform + url: /platform/index.html + fragments: + - platform + - search + - observability + - security-analytics + - vector-database + - benchmarks + children: + - label: Search + url: /platform/search/index.html + - label: Observability + url: /platform/observability/index.html + - + label: Security Analytics + url: /platform/security-analytics/index.html + - + label: Vector Database + url: /platform/search/vector-database.html + - + label: Playground Demo + url: https://playground.opensearch.org/ + - + label: Performance Benchmarks + url: /benchmarks diff --git a/_data/versions.json b/_data/versions.json index 5fe13f29..1dd727f3 100644 --- a/_data/versions.json +++ b/_data/versions.json @@ -1,6 +1,27 @@ { - "current": "1.1", - "past": [ + "current": "2.12", + "all": [ + "2.12", + "1.3" + ], + "archived": [ + "2.11", + "2.10", + "2.9", + "2.8", + "2.7", + "2.6", + "2.5", + "2.4", + "2.3", + "2.2", + "2.1", + "2.0", + "1.2", + "1.1", "1.0" - ] -} \ No newline at end of file + ], + "latest": "2.12" +} + + diff --git a/_developer-documentation/extensions.md b/_developer-documentation/extensions.md new file mode 100644 index 00000000..fd1e279f --- /dev/null +++ b/_developer-documentation/extensions.md @@ -0,0 +1,46 @@ +--- +layout: default +title: Extensions +nav_order: 10 +--- + +# Extensions + +Extensions is an experimental feature. Therefore, we do not recommend the use of extensions in a production environment. For updates on the progress of extensions, or if you want leave feedback that could help improve the feature, refer to the [issue on GitHub](https://github.com/opensearch-project/OpenSearch/issues/2447). +{: .warning} + +Until extensions were introduced, plugins were the only way to extend OpenSearch functionality. However, plugins have significant shortcomings: they require frequent updates to stay up to date with OpenSearch core, they pose a security risk because they run in the same process as OpenSearch, and updating or installing them requires a full cluster restart. Moreover, plugins can fatally impact the cluster in the event of failure. + +Extensions provide an easier, more secure way to customize OpenSearch. Extensions support all plugin functionality and let you build additional modular features for OpenSearch. The [OpenSearch SDK for Java](https://github.com/opensearch-project/opensearch-sdk-java/) provides the library of classes and interfaces that you can use to develop extensions. Extensions are decoupled from OpenSearch core and do not need frequent updates. Additionally, they can run in a separate process or on another node and can be installed while a cluster is running. + +## Getting started + +Use the following documentation to get started with extensions: + +### Step 1: Learn the basics + +Read the [design documentation](https://opensearch-project.github.io/opensearch-sdk-java/DESIGN.html) to learn about extension architecture and how extensions work. 
+ +### Step 2: Try it out + +Try running the sample Hello World extension by following detailed steps in the [Getting started section of the Developer Guide](https://opensearch-project.github.io/opensearch-sdk-java/DEVELOPER_GUIDE.html#getting-started). + +### Step 3: Create your own extension + +Develop a custom create, read, update, delete (CRUD) extension by following the instructions in [this tutorial](https://opensearch-project.github.io/opensearch-sdk-java/CREATE_YOUR_FIRST_EXTENSION.html). + +### Step 4: Learn how to deploy your extension + +For instructions on building, testing, and running an extension, see the [Developing your own extension section of the Developer Guide](https://opensearch-project.github.io/opensearch-sdk-java/DEVELOPER_GUIDE.html#developing-your-own-extension). + + + +## Plugin migration + +The [Anomaly Detection plugin](https://github.com/opensearch-project/anomaly-detection) is now [implemented as an extension](https://github.com/opensearch-project/anomaly-detection/tree/feature/extensions). For details, see [this GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/3635). + +For tips on migrating an existing plugin to an extension, see the [plugin migration documentation](https://opensearch-project.github.io/opensearch-sdk-java/PLUGIN_MIGRATION.html). \ No newline at end of file diff --git a/_developer-documentation/index.md b/_developer-documentation/index.md new file mode 100644 index 00000000..46ea7dd8 --- /dev/null +++ b/_developer-documentation/index.md @@ -0,0 +1,27 @@ +--- +layout: default +title: Developer documentation +nav_order: 1 +has_children: false +has_toc: false +nav_exclude: true +permalink: /developer-documentation/ +redirect_from: + - /developer-documentation/index/ +--- + +# Developer documentation + +We welcome your contributions to the OpenSearch Project. Here are some helpful links to explore the OpenSearch repositories and learn how to contribute: + +- [OpenSearch Project GitHub repo](https://github.com/opensearch-project/) +- [Javadoc documentation](https://opensearch.org/javadocs/) +- [Getting started as an OpenSearch contributor](https://github.com/opensearch-project/.github/blob/main/ONBOARDING.md) +- [OpenSearch Dashboards Developer Guide](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/DEVELOPER_GUIDE.md) +- [OpenSearch release schedule and maintenance policy](https://opensearch.org/releases.html) +- [OpenSearch Project roadmap](https://github.com/orgs/opensearch-project/projects/1) +- [OpenSearch Community Forum](https://forum.opensearch.org/) + +## What's new + +New in version 2.9, OpenSearch introduces _extensions_---an easier-to-develop and more secure alternative to plugins---to simplify creating custom functionality for OpenSearch. To learn more about building extensions using _OpenSearch SDK for Java_, see [Extensions]({{site.url}}{{site.baseurl}}/developer-documentation/extensions/). 
diff --git a/_external_links/developer-guide.md b/_external_links/developer-guide.md deleted file mode 100644 index 5f07b6ae..00000000 --- a/_external_links/developer-guide.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: default -title: Dashboards developer guide -nav_order: 2 -permalink: /dashboards-developer-guide/ -redirect_to: https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/DEVELOPER_GUIDE.md ---- diff --git a/_external_links/javadoc.md b/_external_links/javadoc.md deleted file mode 100644 index 8c937391..00000000 --- a/_external_links/javadoc.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: default -title: Javadoc -nav_order: 1 -permalink: /javadoc/ -redirect_to: https://opensearch.org/javadocs/ ---- diff --git a/_external_links/javadocs.md b/_external_links/javadocs.md deleted file mode 100644 index bdaf076d..00000000 --- a/_external_links/javadocs.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -layout: default -nav_exclude: true -permalink: /javadocs/ -redirect_to: https://opensearch.org/javadocs/ ---- diff --git a/_field-types/index.md b/_field-types/index.md new file mode 100644 index 00000000..7a7e816a --- /dev/null +++ b/_field-types/index.md @@ -0,0 +1,222 @@ +--- +layout: default +title: Mappings and field types +nav_order: 1 +nav_exclude: true +permalink: /field-types/ +redirect_from: + - /opensearch/mappings/ + - /field-types/mappings/ + - /field-types/index/ +--- + +# Mappings and field types + +You can define how documents and their fields are stored and indexed by creating a _mapping_. The mapping specifies the list of fields for a document. Every field in the document has a _field type_, which defines the type of data the field contains. For example, you may want to specify that the `year` field should be of type `date`. To learn more, see [Supported field types]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/index/). + +If you're just starting to build out your cluster and data, you may not know exactly how your data should be stored. In those cases, you can use dynamic mappings, which tell OpenSearch to dynamically add data and its fields. However, if you know exactly what types your data falls under and want to enforce that standard, then you can use explicit mappings. + +For example, if you want to indicate that `year` should be of type `text` instead of an `integer`, and `age` should be an `integer`, you can do so with explicit mappings. By using dynamic mapping, OpenSearch might interpret both `year` and `age` as integers. + +This section provides an example for how to create an index mapping and how to add a document to it that will get ip_range validated. + +#### Table of contents +1. TOC +{:toc} + + +--- +## Dynamic mapping + +When you index a document, OpenSearch adds fields automatically with dynamic mapping. You can also explicitly add fields to an index mapping. + +#### Dynamic mapping types + +Type | Description +:--- | :--- +null | A `null` field can't be indexed or searched. When a field is set to null, OpenSearch behaves as if that field has no values. +boolean | OpenSearch accepts `true` and `false` as boolean values. An empty string is equal to `false.` +float | A single-precision 32-bit floating point number. +double | A double-precision 64-bit floating point number. +integer | A signed 32-bit number. +object | Objects are standard JSON objects, which can have fields and mappings of their own. For example, a `movies` object can have additional properties such as `title`, `year`, and `director`. 
+array | Arrays in OpenSearch can only store values of one type, such as an array of just integers or strings. Empty arrays are treated as though they are fields with no values. +text | A string sequence of characters that represent full-text values. +keyword | A string sequence of structured characters, such as an email address or ZIP code. +date detection string | Enabled by default, if new string fields match a date's format, then the string is processed as a `date` field. For example, `date: "2012/03/11"` is processed as a date. +numeric detection string | If disabled, OpenSearch may automatically process numeric values as strings when they should be processed as numbers. When enabled, OpenSearch can process strings into `long`, `integer`, `short`, `byte`, `double`, `float`, `half_float`, `scaled_float`, and `unsigned_long`. Default is disabled. + +## Explicit mapping + +If you know exactly what your field data types need to be, you can specify them in your request body when creating your index. + +```json +PUT sample-index1 +{ + "mappings": { + "properties": { + "year": { "type" : "text" }, + "age": { "type" : "integer" }, + "director":{ "type" : "text" } + } + } +} +``` + +### Response +```json +{ + "acknowledged": true, + "shards_acknowledged": true, + "index": "sample-index1" +} +``` + +To add mappings to an existing index or data stream, you can send a request to the `_mapping` endpoint using the `PUT` or `POST` HTTP method: + +```json +POST sample-index1/_mapping +{ + "properties": { + "year": { "type" : "text" }, + "age": { "type" : "integer" }, + "director":{ "type" : "text" } + } +} +``` + +You cannot change the mapping of an existing field, you can only modify the field's mapping parameters. +{: .note} + +--- +## Mapping example usage + +The following example shows how to create a mapping to specify that OpenSearch should ignore any documents with malformed IP addresses that do not conform to the [`ip`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/ip/) data type. You accomplish this by setting the `ignore_malformed` parameter to `true`. + +### Create an index with an `ip` mapping + +To create an index, use a PUT request: + +```json +PUT /test-index +{ + "mappings" : { + "properties" : { + "ip_address" : { + "type" : "ip", + "ignore_malformed": true + } + } + } +} +``` + +You can add a document that has a malformed IP address to your index: + +```json +PUT /test-index/_doc/1 +{ + "ip_address" : "malformed ip address" +} +``` + +This indexed IP address does not throw an error because `ignore_malformed` is set to true. + +You can query the index using the following request: + +```json +GET /test-index/_search +``` + +The response shows that the `ip_address` field is ignored in the indexed document: + +```json +{ + "took": 14, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "test-index", + "_id": "1", + "_score": 1, + "_ignored": [ + "ip_address" + ], + "_source": { + "ip_address": "malformed ip address" + } + } + ] + } +} +``` + +## Get a mapping + +To get all mappings for one or more indexes, use the following request: + +```json +GET /_mapping +``` + +In the above request, `` may be an index name or a comma-separated list of index names. 
+ +To get all mappings for all indexes, use the following request: + +```json +GET _mapping +``` + +To get a mapping for a specific field, provide the index name and the field name: + +```json +GET _mapping/field/ +GET //_mapping/field/ +``` + +Both `` and `` can be specified as one value or a comma-separated list. + +For example, the following request retrieves the mapping for the `year` and `age` fields in `sample-index1`: + +```json +GET sample-index1/_mapping/field/year,age +``` + +The response contains the specified fields: + +```json +{ + "sample-index1" : { + "mappings" : { + "year" : { + "full_name" : "year", + "mapping" : { + "year" : { + "type" : "text" + } + } + }, + "age" : { + "full_name" : "age", + "mapping" : { + "age" : { + "type" : "integer" + } + } + } + } + } +} +``` diff --git a/_field-types/supported-field-types/alias.md b/_field-types/supported-field-types/alias.md new file mode 100644 index 00000000..29cc5888 --- /dev/null +++ b/_field-types/supported-field-types/alias.md @@ -0,0 +1,95 @@ +--- +layout: default +title: Alias +nav_order: 10 +has_children: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/alias/ + - /field-types/alias/ +--- + +# Alias field type + +An alias field type creates another name for an existing field. You can use aliases in the[search](#using-aliases-in-search-api-operations) and [field capabilities](#using-aliases-in-field-capabilities-api-operations) API operations, with some [exceptions](#exceptions). To set up an [alias](#alias-field), you need to specify the [original field](#original-field) name in the `path` parameter. + +## Example + +```json +PUT movies +{ + "mappings" : { + "properties" : { + "year" : { + "type" : "date" + }, + "release_date" : { + "type" : "alias", + "path" : "year" + } + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +Parameter | Description +:--- | :--- +`path` | The full path to the original field, including all parent objects. For example, parent.child.field_name. Required. + +## Alias field + +Alias fields must obey the following rules: + +- An alias field can only have one original field. +- In nested objects, the alias must have the same nesting level as the original field. + +To change the field that the alias references, update the mappings. Note that aliases in any previously stored percolator queries will still reference the original field. +{: .note } + +## Original field + +The original field for an alias must obey the following rules: +- The original field must be created before the alias is created. +- The original field cannot be an object or another alias. + +## Using aliases in search API operations + +You can use aliases in the following read operations of the search API: +- Queries +- Sorts +- Aggregations +- `stored_fields` +- `docvalue_fields` +- Suggestions +- Highlights +- Scripts that access field values + +## Using aliases in field capabilities API operations + +To use an alias in the field capabilities API, specify it in the fields parameter. + +```json +GET movies/_field_caps?fields=release_date +``` +{% include copy-curl.html %} + +## Exceptions + +You cannot use aliases in the following situations: +- In write requests, such as update requests. +- In multi-fields or as a target of `copy_to`. +- As a _source parameter for filtering results. +- In APIs that take field names, such as term vectors. +- In `terms`, `more_like_this`, and `geo_shape` queries (aliases are not supported when retrieving documents). 
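Outside of these exceptions, an alias behaves just like the field it references in read operations. As a brief sketch based on the `movies` mapping above, the following query references the `release_date` alias exactly as it would the underlying `year` field; the date values are arbitrary placeholders.

```json
GET movies/_search
{
  "query": {
    "range": {
      "release_date": {
        "gte": "2019-01-01",
        "lte": "2019-12-31"
      }
    }
  }
}
```
{% include copy-curl.html %}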
+ +## Wildcards + +In search and field capabilities wildcard queries, both the original field and the alias are matched against the wildcard pattern. + +```json +GET movies/_field_caps?fields=release* +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_field-types/supported-field-types/autocomplete.md b/_field-types/supported-field-types/autocomplete.md new file mode 100644 index 00000000..79b98aa1 --- /dev/null +++ b/_field-types/supported-field-types/autocomplete.md @@ -0,0 +1,20 @@ +--- +layout: default +title: Autocomplete field types +nav_order: 50 +has_children: true +has_toc: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/autocomplete/ + - /field-types/autocomplete/ +--- + +# Autocomplete field types + +The following table lists all autocomplete field types that OpenSearch supports. + +Field data type | Description +:--- | :--- +[`completion`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/completion/) | A completion suggester that provides autocomplete functionality using prefix completion. You need to upload a list of all possible completions into the index before using this feature. +[`search_as_you_type`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/search-as-you-type/) | Provides search-as-you-type functionality using both prefix and infix completion. diff --git a/_field-types/supported-field-types/binary.md b/_field-types/supported-field-types/binary.md new file mode 100644 index 00000000..d6974ad4 --- /dev/null +++ b/_field-types/supported-field-types/binary.md @@ -0,0 +1,54 @@ +--- +layout: default +title: Binary +parent: Supported field types +nav_order: 12 +has_children: false +redirect_from: + - /opensearch/supported-field-types/binary/ + - /field-types/binary/ +--- + +# Binary field type + +A binary field type contains a binary value in [Base64](https://en.wikipedia.org/wiki/Base64) encoding that is not searchable. + +## Example + +Create a mapping with a binary field: + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "binary_value" : { + "type" : "binary" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with a binary value: + +```json +PUT testindex/_doc/1 +{ + "binary_value" : "bGlkaHQtd29rfx4=" +} +``` +{% include copy-curl.html %} + +Use `=` as a padding character. Embedded newline characters are not allowed. +{: .note } + +## Parameters + +The following table lists the parameters accepted by binary field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`doc_values` | A Boolean value that specifies whether the field should be stored on disk so that it can be used for aggregations, sorting, or scripting. Optional. Default is `true`. +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the _source field. Optional. Default is `false`. \ No newline at end of file diff --git a/_field-types/supported-field-types/boolean.md b/_field-types/supported-field-types/boolean.md new file mode 100644 index 00000000..8233a45a --- /dev/null +++ b/_field-types/supported-field-types/boolean.md @@ -0,0 +1,157 @@ +--- +layout: default +title: Boolean +nav_order: 20 +has_children: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/boolean/ + - /field-types/boolean/ +--- + +# Boolean field type + +A Boolean field type takes `true` or `false` values, or `"true"` or `"false"` strings. 
You can also pass an empty string (`""`) in place of a `false` value. + +## Example + +Create a mapping where a, b, and c are Boolean fields: + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "a" : { + "type" : "boolean" + }, + "b" : { + "type" : "boolean" + }, + "c" : { + "type" : "boolean" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with Boolean values: + +```json +PUT testindex/_doc/1 +{ + "a" : true, + "b" : "true", + "c" : "" +} +``` +{% include copy-curl.html %} + +As a result, `a` and `b` will be set to `true`, and `c` will be set to `false`. + +Search for all documents where `c` is false: + +```json +GET testindex/_search +{ + "query": { + "term" : { + "c" : false + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by Boolean field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`boost` | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field's relevance. Values between 0.0 and 1.0 decrease the field's relevance. Default is 1.0. +`doc_values` | A Boolean value that specifies whether the field should be stored on disk so that it can be used for aggregations, sorting or scripting. Default is `true`. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +`meta` | Accepts metadata for this field. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the _source field. Default is `false`. + +## Boolean values in aggregations and scripts + +In aggregations on Boolean fields, `key` returns numeric values (1 for `true` or 0 for `false`), and `key_as_string` returns strings (`"true"` or `"false"`). Scripts return `true` and `false` for Boolean values. 
+ +### Example + +Run a terms aggregation query on the field `a`: + +```json +GET testindex/_search +{ + "aggs": { + "agg1": { + "terms": { + "field": "a" + } + } + }, + "script_fields": { + "a": { + "script": { + "lang": "painless", + "source": "doc['a'].value" + } + } + } +} +``` +{% include copy-curl.html %} + +The script returns the value of `a` as `true`, `key` returns the value of `a` as `1`, and `key_as_string` returns the value of `a` as `"true"`: + +```json +{ + "took" : 1133, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "testindex", + "_type" : "_doc", + "_id" : "1", + "_score" : 1.0, + "fields" : { + "a" : [ + true + ] + } + } + ] + }, + "aggregations" : { + "agg1" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : 1, + "key_as_string" : "true", + "doc_count" : 1 + } + ] + } + } +} +``` diff --git a/_field-types/supported-field-types/completion.md b/_field-types/supported-field-types/completion.md new file mode 100644 index 00000000..9214c258 --- /dev/null +++ b/_field-types/supported-field-types/completion.md @@ -0,0 +1,392 @@ +--- +layout: default +title: Completion +nav_order: 51 +has_children: false +parent: Autocomplete field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/completion/ + - /field-types/completion/ +--- + +# Completion field type + +A completion field type provides autocomplete functionality through a completion suggester. The completion suggester is a prefix suggester, so it matches the beginning of text only. A completion suggester creates an in-memory data structure, which provides faster lookups but leads to increased memory usage. You need to upload a list of all possible completions into the index before using this feature. + +## Example + +Create a mapping with a completion field: + +```json +PUT chess_store +{ + "mappings": { + "properties": { + "suggestions": { + "type": "completion" + }, + "product": { + "type": "keyword" + } + } + } +} +``` +{% include copy-curl.html %} + +Index suggestions into OpenSearch: + +```json +PUT chess_store/_doc/1 +{ + "suggestions": { + "input": ["Books on openings", "Books on endgames"], + "weight" : 10 + } +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by completion fields. + +Parameter | Description +:--- | :--- +`input` | A list of possible completions as a string or array of strings. Cannot contain `\u0000` (null), `\u001f` (information separator one), or `\u001e` (information separator two). Required. +`weight` | A positive integer or a positive integer string for ranking suggestions. Optional. 
+ +Multiple suggestions can be indexed as follows: + +```json +PUT chess_store/_doc/2 +{ + "suggestions": [ + { + "input": "Chess set", + "weight": 20 + }, + { + "input": "Chess pieces", + "weight": 10 + }, + { + "input": "Chess board", + "weight": 5 + } + ] +} +``` +{% include copy-curl.html %} + +As an alternative, you can use the following shorthand notation (note that you cannot provide the `weight` parameter in this notation): + +```json +PUT chess_store/_doc/3 +{ + "suggestions" : [ "Chess clock", "Chess timer" ] +} +``` +{% include copy-curl.html %} + +## Querying completion field types + +To query completion field types, specify the prefix that you want to search for and the name of the field in which to look for suggestions. + +Query the index for suggestions that start with the word "chess": + +```json +GET chess_store/_search +{ + "suggest": { + "product-suggestions": { + "prefix": "chess", + "completion": { + "field": "suggestions" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains autocomplete suggestions: + +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "product-suggestions" : [ + { + "text" : "chess", + "offset" : 0, + "length" : 5, + "options" : [ + { + "text" : "Chess set", + "_index" : "chess_store", + "_type" : "_doc", + "_id" : "2", + "_score" : 20.0, + "_source" : { + "suggestions" : [ + { + "input" : "Chess set", + "weight" : 20 + }, + { + "input" : "Chess pieces", + "weight" : 10 + }, + { + "input" : "Chess board", + "weight" : 5 + } + ] + } + }, + { + "text" : "Chess clock", + "_index" : "chess_store", + "_type" : "_doc", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "suggestions" : [ + "Chess clock", + "Chess timer" + ] + } + } + ] + } + ] + } +} +``` + +In the response, the `_score` field contains the value of the `weight` parameter that was set up at index time. The `text` field is populated with the suggestion's `input` parameter. + +By default, the response contains the whole document, including the `_source` field, which may impact performance. To return only the `suggestions` field, you can specify that in the `_source` parameter. You can also restrict the number of returned suggestions by specifying the `size` parameter. 
+ +```json +GET chess_store/_search +{ + "_source": "suggestions", + "suggest": { + "product-suggestions": { + "prefix": "chess", + "completion": { + "field": "suggestions", + "size" : 3 + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the suggestions: + +```json +{ + "took" : 5, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "product-suggestions" : [ + { + "text" : "chess", + "offset" : 0, + "length" : 5, + "options" : [ + { + "text" : "Chess set", + "_index" : "chess_store", + "_type" : "_doc", + "_id" : "2", + "_score" : 20.0, + "_source" : { + "suggestions" : [ + { + "input" : "Chess set", + "weight" : 20 + }, + { + "input" : "Chess pieces", + "weight" : 10 + }, + { + "input" : "Chess board", + "weight" : 5 + } + ] + } + }, + { + "text" : "Chess clock", + "_index" : "chess_store", + "_type" : "_doc", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "suggestions" : [ + "Chess clock", + "Chess timer" + ] + } + } + ] + } + ] + } +} +``` + +To take advantage of source filtering, use the suggest functionality on the `_search` endpoint. The `_suggest` endpoint does not support source filtering. +{: .note} + +## Completion query parameters + +The following table lists the parameters accepted by the completion suggester query. + +Parameter | Description +:--- | :--- +`field` | A string that specifies the field on which to run the query. Required. +`size` | An integer that specifies the maximum number of returned suggestions. Optional. Default is 5. +`skip_duplicates` | A Boolean value that specifies whether to skip duplicate suggestions. Optional. Default is `false`. + +## Fuzzy completion query + +To allow for fuzzy matching, you can specify the `fuzziness` parameter for the completion query. In this case, even if the user mistypes a search term, the completion query still returns results. Additionally, the longer the prefix that matches the query, the higher the document's score. + +```json +GET chess_store/_search +{ + "suggest": { + "product-suggestions": { + "prefix": "chesc", + "completion": { + "field": "suggestions", + "size" : 3, + "fuzzy" : { + "fuzziness" : "AUTO" + } + } + } + } +} +``` +{% include copy-curl.html %} + +To use all default fuzziness options, specify `"fuzzy": {}` or `"fuzzy": true`. +{: .tip} + +The following table lists the parameters accepted by the fuzzy completion suggester query. All of the parameters are optional. + +Parameter | Description +:--- | :--- +`fuzziness` | Fuzziness can be set as one of the following:
1. An integer that specifies the maximum allowed [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) for this edit.
2. `AUTO`: Strings of 0–2 characters must match exactly, strings of 3–5 characters allow 1 edit, and strings longer than 5 characters allow 2 edits.
Default is `AUTO`. +`min_length` | An integer that specifies the minimum length the input must be to start returning suggestions. If the search term is shorter than `min_length`, no suggestions are returned. Default is 3. +`prefix_length` | An integer that specifies the minimum length the matched prefix must be to start returning suggestions. If the prefix of `prefix_length` is not matched, but the search term is still within the Levenshtein distance, no suggestions are returned. Default is 1. +`transpositions` | A Boolean value that specifies to count transpositions (interchanges of adjacent characters) as one edit instead of two. Example: The suggestion's `input` parameter is `abcde` and the `fuzziness` is 1. If `transpositions` is set to `true`, `abdce` will match, but if `transpositions` is set to `false`, `abdce` will not match. Default is `true`. +`unicode_aware` | A Boolean value that specifies whether to use Unicode code points when measuring the edit distance, transposition, and length. If `unicode_aware` is set to `true`, the measurement is slower. Default is `false`, in which case distances are measured in bytes. + +## Regex queries + +You can use a regular expression to define the prefix for the completion suggester query. + +For example, to search for strings that start with "a" and have a "d" later on, use the following query: + +```json +GET chess_store/_search +{ + "suggest": { + "product-suggestions": { + "regex": "a.*d", + "completion": { + "field": "suggestions" + } + } + } +} +``` +{% include copy-curl.html %} + +The response matches the string "abcde": + +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "product-suggestions" : [ + { + "text" : "a.*d", + "offset" : 0, + "length" : 4, + "options" : [ + { + "text" : "abcde", + "_index" : "chess_store", + "_type" : "_doc", + "_id" : "2", + "_score" : 20.0, + "_source" : { + "suggestions" : [ + { + "input" : "abcde", + "weight" : 20 + } + ] + } + } + ] + } + ] + } +} +``` \ No newline at end of file diff --git a/_field-types/supported-field-types/date-nanos.md b/_field-types/supported-field-types/date-nanos.md new file mode 100644 index 00000000..12399a69 --- /dev/null +++ b/_field-types/supported-field-types/date-nanos.md @@ -0,0 +1,290 @@ +--- +layout: default +title: Date nanoseconds +nav_order: 35 +has_children: false +parent: Date field types +grand_parent: Supported field types +--- + +# Date nanoseconds field type + +The `date_nanos` field type is similar to the [`date`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/) field type in that it holds a date. However, `date` stores the date in millisecond resolution, while `date_nanos` stores the date in nanosecond resolution. Dates are stored as `long` values that correspond to nanoseconds since the epoch. Therefore, the range of supported dates is approximately 1970--2262. + +Queries on `date_nanos` fields are converted to range queries on the field value's `long` representation. Then the stored fields and aggregation results are converted to a string using the format set on the field. + +The `date_nanos` field supports all [formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date#formats) and [parameters]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date#parameters) that `date` supports. 
You can use multiple formats separated by `||`. +{: .note} + +For `date_nanos` fields, you can use the `strict_date_optional_time_nanos` format to preserve nanosecond resolution. If you don't specify the format when mapping a field as `date_nanos`, the default format is `strict_date_optional_time||epoch_millis` that lets you pass values in either `strict_date_optional_time` or `epoch_millis` format. The `strict_date_optional_time` format supports dates in nanosecond resolution, but the `epoch_millis` format supports dates in millisecond resolution only. + +## Example + +Create a mapping with the `date` field of type `date_nanos` that has the `strict_date_optional_time_nanos` format: + +```json +PUT testindex/_mapping +{ + "properties": { + "date": { + "type": "date_nanos", + "format" : "strict_date_optional_time_nanos" + } + } +} +``` +{% include copy-curl.html %} + +Index two documents into the index: + +```json +PUT testindex/_doc/1 +{ "date": "2022-06-15T10:12:52.382719622Z" } +``` +{% include copy-curl.html %} + +```json +PUT testindex/_doc/2 +{ "date": "2022-06-15T10:12:52.382719624Z" } +``` +{% include copy-curl.html %} + +You can use a range query to search for a date range: + +```json +GET testindex/_search +{ + "query": { + "range": { + "date": { + "gte": "2022-06-15T10:12:52.382719621Z", + "lte": "2022-06-15T10:12:52.382719623Z" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the document whose date is in the specified range: + +```json +{ + "took": 43, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_score": 1, + "_source": { + "date": "2022-06-15T10:12:52.382719622Z" + } + } + ] + } +} +``` + +When querying documents with `date_nanos` fields, you can use `fields` or `docvalue_fields`: + +```json +GET testindex/_search +{ + "fields": ["date"] +} +``` +{% include copy-curl.html %} + +```json +GET testindex/_search +{ + "docvalue_fields" : [ + { + "field" : "date" + } + ] +} +``` +{% include copy-curl.html %} + +The response to either of the preceding queries contains both indexed documents: + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_score": 1, + "_source": { + "date": "2022-06-15T10:12:52.382719622Z" + }, + "fields": { + "date": [ + "2022-06-15T10:12:52.382719622Z" + ] + } + }, + { + "_index": "testindex", + "_id": "2", + "_score": 1, + "_source": { + "date": "2022-06-15T10:12:52.382719624Z" + }, + "fields": { + "date": [ + "2022-06-15T10:12:52.382719624Z" + ] + } + } + ] + } +} +``` + +You can sort on a `date_nanos` field as follows: + +```json +GET testindex/_search +{ + "sort": { + "date": "asc" + } +} +``` +{% include copy-curl.html %} + +The response contains the sorted documents: + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": null, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_score": null, + "_source": { + "date": "2022-06-15T10:12:52.382719622Z" + }, + "sort": [ + 1655287972382719700 + ] + }, + { + "_index": "testindex", + "_id": "2", + "_score": null, + 
"_source": { + "date": "2022-06-15T10:12:52.382719624Z" + }, + "sort": [ + 1655287972382719700 + ] + } + ] + } +} +``` + +You can also use a Painless script to access the nanoseconds part of the field: + +```json +GET testindex/_search +{ + "script_fields" : { + "my_field" : { + "script" : { + "lang" : "painless", + "source" : "doc['date'].value.nano" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains only the nanosecond parts of the fields: + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_score": 1, + "fields": { + "my_field": [ + 382719622 + ] + } + }, + { + "_index": "testindex", + "_id": "2", + "_score": 1, + "fields": { + "my_field": [ + 382719624 + ] + } + } + ] + } +} +``` \ No newline at end of file diff --git a/_field-types/supported-field-types/date.md b/_field-types/supported-field-types/date.md new file mode 100644 index 00000000..e0b99f3a --- /dev/null +++ b/_field-types/supported-field-types/date.md @@ -0,0 +1,359 @@ +--- +layout: default +title: Date +nav_order: 25 +has_children: false +parent: Date field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/date/ + - /field-types/date/ +--- + +# Date field type + +A date in OpenSearch can be represented as one of the following: + +- A long value that corresponds to milliseconds since the epoch (the value must be non-negative). Dates are stored in this form internally. +- A formatted string. +- An integer value that corresponds to seconds since the epoch (the value must be non-negative). + +To represent date ranges, there is a date [range field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/range/). +{: .note } + +## Example + +Create a mapping with a date field and two date formats: + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "release_date" : { + "type" : "date", + "format" : "strict_date_optional_time||epoch_millis" + } + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by date field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`boost` | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field's relevance. Values between 0.0 and 1.0 decrease the field's relevance. Default is 1.0. +`doc_values` | A Boolean value that specifies whether the field should be stored on disk so that it can be used for aggregations, sorting, or scripting. Default is `false`. +`format` | The format for parsing dates. Default is `strict_date_time_no_millis||strict_date_optional_time||epoch_millis`. +`ignore_malformed` | A Boolean value that specifies to ignore malformed values and not to throw an exception. Default is `false`. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +`locale` | A region- and language-specific way of representing the date. Default is [`ROOT`](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html#ROOT) (a region- and language-neutral locale). +`meta` | Accepts metadata for this field. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. 
If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the _source field. Default is `false`. + +## Formats + +OpenSearch has built-in date formats, but you can also create your own custom formats. You can specify multiple date formats, separated by `||`. + +## Default format + +As of OpenSearch 2.12, the default date format is `strict_date_time_no_millis||strict_date_optional_time||epoch_millis`. To revert the default format back to `strict_date_optional_time||epoch_millis` (the default format for OpenSearch 2.11 and earlier), set the `opensearch.experimental.optimization.datetime_formatter_caching.enabled` feature flag to `false`. For more information about enabling and disabling feature flags, see [Enabling experimental features]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/experimental/). + +## Built-in formats + +Most of the date formats have a `strict_` counterpart. When the format starts with `strict_`, the date must have the correct number of digits specified in the format. For example, if the format is set to `strict_year_month_day` (`"yyyy-MM-dd"`), both month and day have to be two-digit numbers. So, `"2020-06-09"` is valid, while `"2020-6-9"` is invalid. + +Epoch is defined as 00:00:00 UTC on January 1, 1970. +{: .note } + +y: year
+Y: [week-based year](https://en.wikipedia.org/wiki/ISO_8601#Week_dates)
+M: month
+w: ordinal [week of the year](https://en.wikipedia.org/wiki/ISO_8601#Week_dates) from 01 to 53
+d: day
+D: ordinal day of the year from 001 to 365 (366 for leap years)
+e: ordinal day of the week from 1 (Monday) to 7 (Sunday)
+H: hour from 0 to 23
+m: minute
+s: second
+S: fraction of a second
+Z: time zone offset (for example, +0400; -0400; -04:00)
+{: .note } + +### Numeric date formats + +Format name and description | Examples +:--- | :--- +`epoch_millis`
The number of milliseconds since the epoch. Minimum is -2<sup>63</sup>. Maximum is 2<sup>63</sup> − 1. | 1553391286000 +`epoch_second`

The number of seconds since the epoch. Minimum is -2<sup>63</sup> ÷ 1000. Maximum is (2<sup>63</sup> − 1) ÷ 1000. | 1553391286 + +### Basic date formats + +Components of basic date formats are not separated by a delimiter. For example, "20190323". + +Format name and description | Pattern and examples +:--- | :--- +**Dates**| +`basic_date_time`

A basic date and time separated by `T`. | `"yyyyMMdd`T`HHmmss.SSSZ"`
`"20190323T213446.123-04:00"` +`basic_date_time_no_millis`
A basic date and time without milliseconds, separated by `T`. | `"yyyyMMdd`T`HHmmssZ"`
`"20190323T213446-04:00"` +`basic_date`
A date with a four-digit year, two-digit month, and two-digit day. | `"yyyyMMdd"`

`"20190323"` +**Times** | +`basic_time`

A time with a two-digit hour, two-digit minute, two-digit second, three-digit millisecond, and time zone offset. |`"HHmmss.SSSZ"`
`"213446.123-04:00"` +`basic_time_no_millis`
A basic time without milliseconds. | `"HHmmssZ"`
`"213446-04:00"` +**T times** | +`basic_t_time`
A basic time preceded by `T`. | `"`T`HHmmss.SSSZ"`
`"T213446.123-04:00"` +`basic_t_time_no_millis`
A basic time without milliseconds, preceded by `T`. | `"`T`HHmmssZ"`
`"T213446-04:00"` +**Ordinal dates** | +`basic_ordinal_date_time`
A full ordinal date and time. | `"yyyyDDD`T`HHmmss.SSSZ"`
`"2019082T213446.123-04:00"` +`basic_ordinal_date_time_no_millis`
A full ordinal date and time without milliseconds. | `"yyyyDDD`T`HHmmssZ"`
`"2019082T213446-04:00"` +`basic_ordinal_date`
A date with a four-digit year and three-digit ordinal day of the year. | `"yyyyDDD"`
`"2019082"` +**Week-based dates** | +`basic_week_date_time`
`strict_basic_week_date_time`
A full week-based date and time separated by `T`. | `"YYYY`W`wwe`T`HHmmss.SSSZ"`
`"2019W126213446.123-04:00"` +`basic_week_date_time_no_millis`
`strict_basic_week_date_time_no_millis`
A basic week-based year date and time without milliseconds, separated by `T`. | `"YYYY`W`wwe`T`HHmmssZ"`

`"2019W126213446-04:00"` +`basic_week_date`

`strict_basic_week_date`
A full week-based date with a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week separated by `W`. | `"YYYY`W`wwe"`
`"2019W126"` + +### Full date formats + +Components of full date formats are separated by a `-` delimiter for date and `:` delimiter for time. For example, `"2019-03-23T21:34"`. + +Format name and description | Pattern and examples +:--- | :--- +**Dates** | +`date_optional_time`
`strict_date_optional_time`
A generic full date and time. Year is required. Month, day, and time are optional. Time is separated from date by `T`. | Multiple patterns.

`"2019-03-23T21:34:46.123456789-04:00"`

`"2019-03-23T21:34:46"`
`"2019-03-23T21:34"`
`"2019"` +`strict_date_optional_time_nanos`
A generic full date and time. Year is required. Month, day, and time are optional. If time is specified, it must contain hours, minutes, and seconds, but fraction of a second is optional. Fraction of a second is one to nine digits long and has nanosecond resolution. Time is separated from date by `T`. | Multiple patterns.
`"2019-03-23T21:34:46.123456789-04:00"`
`"2019-03-23T21:34:46"`
`"2019"` +`date_time`
`strict_date_time`
A full date and time separated by `T`. | `"yyyy-MM-dd`T`HH:mm:ss.SSSZ"`
`"2019-03-23T21:34:46.123-04:00"` +`date_time_no_millis`
`strict_date_time_no_millis`
A full date and time without milliseconds, separated by `T`. | `"yyyy-MM-dd'T'HH:mm:ssZ"`
`"2019-03-23T21:34:46-04:00"` +`date_hour_minute_second_fraction`
`strict_date_hour_minute_second_fraction`
A full date, two-digit hour, two-digit minute, two-digit second, and one- to nine-digit fraction of a second separated by `T`. | `"yyyy-MM-dd`T`HH:mm:ss.SSSSSSSSS"`
`"2019-03-23T21:34:46.123456789"`
`"2019-03-23T21:34:46.1"` +`date_hour_minute_second_millis`
`strict_date_hour_minute_second_millis`
A full date, two-digit hour, two-digit minute, two-digit second, and three-digit millisecond separated by `T`. | `"yyyy-MM-dd`T`HH:mm:ss.SSS"`
`"2019-03-23T21:34:46.123"` +`date_hour_minute_second`
`strict_date_hour_minute_second`
A full date, two-digit hour, two-digit minute, and two-digit second separated by `T`.| `"yyyy-MM-dd`T`HH:mm:ss"`
`"2019-03-23T21:34:46"` +`date_hour_minute`
`strict_date_hour_minute`
A full date, two-digit hour, and two-digit minute. | `"yyyy-MM-dd`T`HH:mm"`
`"2019-03-23T21:34"` +`date_hour`
`strict_date_hour`
A full date and two-digit hour, separated by `T`. | `"yyyy-MM-dd`T`HH"`
`"2019-03-23T21"` +`date`
`strict_date`
A four-digit year, two-digit month, and two-digit day. | `"yyyy-MM-dd"`
`"2019-03-23"` +`year_month_day`
`strict_year_month_day`
A four-digit year, two-digit month, and two-digit day. | `"yyyy-MM-dd"`
`"2019-03-23"` +`year_month`
`strict_year_month`
A four-digit year and two-digit month. | `"yyyy-MM"`
`"2019-03"` +`year`
`strict_year`
A four-digit year. | `"yyyy"`
`"2019"` +`rfc3339_lenient`
An RFC 3339-compatible format that is much faster to parse than other lenient formats, such as `strict_date_optional_time`. | `"YYYY"`

`"2019"`
`"YYYY-MM"`
`"2019-03"`
`"YYYY-MM-DD"`
`"2019-03-23"`
`"YYYY-MM-DDThh:mmTZD"`
`"2019-03-23T21:34Z"`
`"YYYY-MM-DDThh:mm:ssTZD"`
`"2019-03-23T21:34:46Z"`
`"YYYY-MM-DDThh:mm:ss.sTZD"`
`"2019-03-23T21:34:46.123456789-04:00"`
`"YYYY-MM-DDThh:mm:ss,sTZD"`
`"2019-03-23T21:34:46,123456789-04:00"` +**Times** | +`time`
`strict_time`
A two-digit hour, two-digit minute, two-digit second, one- to nine-digit fraction of a second, and time zone offset. | `"HH:mm:ss.SSSSSSSSSZ"`
`"21:34:46.123456789-04:00"`
`"21:34:46.1-04:00"` +`time_no_millis`
`strict_time_no_millis`
A two-digit hour, two-digit minute, two-digit second, and time zone offset. | `"HH:mm:ssZ"`
`"21:34:46-04:00"` +`hour_minute_second_fraction`
`strict_hour_minute_second_fraction`
A two-digit hour, two-digit minute, two-digit second, and one- to nine-digit fraction of a second. | `"HH:mm:ss.SSSSSSSSS"`
`"21:34:46.1"`
`"21:34:46.123456789"` +`hour_minute_second_millis`
`strict_hour_minute_second_millis`
A two-digit hour, two-digit minute, two-digit second, and three-digit millisecond. | `"HH:mm:ss.SSS"`
`"21:34:46.123"` +`hour_minute_second`
`strict_hour_minute_second`
A two-digit hour, two-digit minute, and two-digit second. | `"HH:mm:ss"`
`"21:34:46"` +`hour_minute`
`strict_hour_minute`
A two-digit hour and two-digit minute. | `"HH:mm"`
`"21:34"` +`hour`
`strict_hour`
A two-digit hour. | `"HH"`
`"21"` +**T times** | +`t_time`
`strict_t_time`
A two-digit hour, two-digit minute, two-digit second, one- to nine-digit fraction of a second, and time zone offset, preceded by `T`. | `"`T`HH:mm:ss.SSSSSSSSSZ"`

`"T21:34:46.123456789-04:00"`

`"T21:34:46.1-04:00"` +`t_time_no_millis`
`strict_t_time_no_millis`
A two-digit hour, two-digit minute, two-digit second, and time zone offset, preceded by `T`. | `"`T`HH:mm:ssZ"`
`"T21:34:46-04:00"` +**Ordinal dates** | +`ordinal_date_time`
`strict_ordinal_date_time`
A full ordinal date and time separated by `T`. | `"yyyy-DDD`T`HH:mm:ss.SSSZ"`
`"2019-082T21:34:46.123-04:00"` +`ordinal_date_time_no_millis`
`strict_ordinal_date_time_no_millis`
A full ordinal date and time without milliseconds, separated by `T`. | `"yyyy-DDD`T`HH:mm:ssZ"`
`"2019-082T21:34:46-04:00"` +`ordinal_date`
`strict_ordinal_date`
A full ordinal date with a four-digit year and three-digit ordinal day of the year. | `"yyyy-DDD"`
`"2019-082"` +**Week-based dates** | +`week_date_time`
`strict_week_date_time`
A full week-based date and time separated by `T`. Week date is a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. Time is a two-digit hour, two-digit minute, two-digit second, one- to nine-digit fraction of a second, and a time zone offset. | `"YYYY-`W`ww-e`T`HH:mm:ss.SSSSSSSSSZ"`
`"2019-W12-6T21:34:46.1-04:00"`
`"2019-W12-6T21:34:46.123456789-04:00"` +`week_date_time_no_millis`
`strict_week_date_time_no_millis`
A full week-based date and time without milliseconds, separated by `T`. Week date is a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. Time is a two-digit hour, two-digit minute, two-digit second, and time zone offset. | `"YYYY-`W`ww-e`T`HH:mm:ssZ"`
`"2019-W12-6T21:34:46-04:00"` +`week_date`
`strict_week_date`
A full week-based date with a four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. | `"YYYY-`W`ww-e"`
`"2019-W12-6"` +`weekyear_week_day`
`strict_weekyear_week_day`
A four-digit week-based year, two-digit ordinal week of the year, and one-digit ordinal day of the week. | `"YYYY-'W'ww-e"`

`"2019-W12-6"` +`weekyear_week`
`strict_weekyear_week`
A four-digit week-based year and two-digit ordinal week of the year. | `"YYYY-`W`ww"`
`"2019-W12"` +`weekyear`
`strict_weekyear`
A four-digit week-based year. | `"YYYY"`
`"2019"` + +## Custom formats + +You can create custom formats for date fields. For example, the following request specifies a date in the common `"MM/dd/yyyy"` format: + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "release_date" : { + "type" : "date", + "format" : "MM/dd/yyyy" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with a date: + +```json +PUT testindex/_doc/21 +{ + "release_date" : "03/21/2019" +} +``` +{% include copy-curl.html %} + +When searching for an exact date, provide that date in the same format: + +```json +GET testindex/_search +{ + "query" : { + "match": { + "release_date" : { + "query": "03/21/2019" + } + } + } +} +``` +{% include copy-curl.html %} + +Range queries by default use the field's mapped format. You can also specify the range of dates in a different format by providing the `format` parameter: + +```json +GET testindex/_search +{ + "query": { + "range": { + "release_date": { + "gte": "2019-01-01", + "lte": "2019-12-31", + "format": "yyyy-MM-dd" + } + } + } +} +``` +{% include copy-curl.html %} + +## Date math + +The date field type supports using date math to specify durations in queries. For example, the `gt`, `gte`, `lt`, and `lte` parameters in [range queries]({{site.url}}{{site.baseurl}}/query-dsl/term/range/) and the `from` and `to` parameters in [date range aggregations]({{site.url}}{{site.baseurl}}/query-dsl/aggregations/bucket/date-range/) accept date math expressions. + +A date math expression contains a fixed date, optionally followed by one or more mathematical expressions. The fixed date may be either `now` (current date and time in milliseconds since the epoch) or a string ending with `||` that specifies a date (for example, `2022-05-18||`). The date must be in the [default format](#default-format) (which is `strict_date_time_no_millis||strict_date_optional_time||epoch_millis` by default). + +If you specify multiple date formats in the field mapping, OpenSearch uses the first format to convert the milliseconds since the epoch value to a string.
+If the field mapping contains no format, OpenSearch uses the `strict_date_optional_time` format to convert the epoch value to a string. +{: .note} + +Date math supports the following mathematical operators. + +Operator | Description | Example +:--- | :--- | :--- +`+` | Addition | `+1M`: Add 1 month. +`-` | Subtraction | `-1y`: Subtract 1 year. +`/` | Rounding down | `/h`: Round to the beginning of the hour. + +Date math supports the following time units: + +`y`: Years

+`M`: Months
+`w`: Weeks
+`d`: Days
+`h` or `H`: Hours
+`m`: Minutes
+`s`: Seconds +{: .note } + +### Example expressions + +The following example expressions illustrate using date math: + +- `now+1M`: The current date and time in milliseconds since the epoch, plus 1 month. +- `2022-05-18||/M`: `05/18/2022`, rounded to the beginning of the month. Resolves to `2022-05-01`. +- `2022-05-18T15:23||/h`: `15:23` on `05/18/2022`, rounded to the beginning of the hour. Resolves to `2022-05-18T15`. +- `2022-05-18T15:23:17.789||+2M-1d/d`: `15:23:17.789` on `05/18/2022` plus 2 months minus 1 day, rounded to the beginning of the day. Resolves to `2022-07-17`. + + +### Using date math in a range query + +The following example illustrates using date math in a [range query]({{site.url}}{{site.baseurl}}/query-dsl/term/range/). + +Set up an index with `release_date` mapped as `date`: + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "release_date" : { + "type" : "date" + } + } + } +} +``` +{% include copy-curl.html %} + +Index two documents into the index: + +```json +PUT testindex/_doc/1 +{ + "release_date": "2022-09-14" +} +``` +{% include copy-curl.html %} + +```json +PUT testindex/_doc/2 +{ + "release_date": "2022-11-15" +} +``` +{% include copy-curl.html %} + +The following query searches for documents with `release_date` within 2 months and 1 day of `09/14/2022`. The lower boundary of the range is rounded to the beginning of the day on `09/14/2022`: + +```json +GET testindex/_search +{ + "query": { + "range": { + "release_date": { + "gte": "2022-09-14T15:23||/d", + "lte": "2022-09-14||+2M+1d" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains both documents: + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "testindex", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "release_date" : "2022-11-14" + } + }, + { + "_index" : "testindex", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "release_date" : "2022-09-14" + } + } + ] + } +} +``` diff --git a/_field-types/supported-field-types/dates.md b/_field-types/supported-field-types/dates.md new file mode 100644 index 00000000..7c6e47cb --- /dev/null +++ b/_field-types/supported-field-types/dates.md @@ -0,0 +1,17 @@ +--- +layout: default +title: Date field types +nav_order: 25 +has_children: true +has_toc: false +parent: Supported field types +--- + +# Date field types + +Date field types contain a date value that can be formatted using different date formats. The following table lists all date field types that OpenSearch supports. + +Field data type | Description +:--- | :--- +[`date`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/) | A date stored in millisecond resolution. +[`date_nanos`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date-nanos/) | A date stored in nanosecond resolution. diff --git a/_field-types/supported-field-types/flat-object.md b/_field-types/supported-field-types/flat-object.md new file mode 100644 index 00000000..933c385a --- /dev/null +++ b/_field-types/supported-field-types/flat-object.md @@ -0,0 +1,244 @@ +--- +layout: default +title: Flat object +nav_order: 43 +has_children: false +parent: Object field types +grand_parent: Supported field types +redirect_from: + - /field-types/flat-object/ +--- + +# Flat object field type + +In OpenSearch, you don't have to specify a mapping before indexing documents. 
If you don't specify a mapping, OpenSearch uses [dynamic mapping]({{site.url}}{{site.baseurl}}/field-types/index#dynamic-mapping) to map every field and its subfields in the document automatically. When you ingest documents such as logs, you may not know every field's subfield name and type in advance. In this case, dynamically mapping all new subfields can quickly lead to a "mapping explosion," where the growing number of fields may degrade the performance of your cluster. + +The flat object field type solves this problem by treating the entire JSON object as a string. Subfields within the JSON object are accessible using standard dot path notation, but they are not indexed for fast lookup. + +The maximum field value length in the dot notation is 2<sup>24</sup> − 1. +{: .note} + +The flat object field type provides the following benefits: + +- Efficient reads: Fetching performance is similar to that of a keyword field. +- Memory efficiency: Storing the entire complex JSON object in one field without indexing all of its subfields reduces the number of fields in an index. +- Space efficiency: OpenSearch does not create an inverted index for subfields in flat objects, thereby saving space. +- Compatibility for migration: You can migrate your data from systems that support similar flat types to OpenSearch. + +Mapping a field as a flat object applies when a field and its subfields are mostly read and not used as search criteria because the subfields are not indexed. Flat objects are useful for objects with a large number of fields or when you don't know the keys in advance. + +Flat objects support exact match queries with and without dot path notation. For a complete list of supported query types, see [Supported queries](#supported-queries). + +Searching for a specific value of a nested field in a document may be inefficient because it may require a full scan of the index, which can be an expensive operation. +{: .note} + +Flat objects do not support: + +- Type-specific parsing. +- Numerical operations, such as numerical comparison or numerical sorting. +- Text analysis. +- Highlighting. +- Aggregations of subfields using dot notation. +- Filtering by subfields. + +## Supported queries + +The flat object field type supports the following queries: + +- [Term]({{site.url}}{{site.baseurl}}/query-dsl/term/term/) +- [Terms]({{site.url}}{{site.baseurl}}/query-dsl/term/terms/) +- [Terms set]({{site.url}}{{site.baseurl}}/query-dsl/term/terms-set/) +- [Prefix]({{site.url}}{{site.baseurl}}/query-dsl/term/prefix/) +- [Range]({{site.url}}{{site.baseurl}}/query-dsl/term/range/) +- [Match]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match/) +- [Multi-match]({{site.url}}{{site.baseurl}}/query-dsl/full-text/multi-match/) +- [Query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) +- [Simple query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/simple-query-string/) +- [Exists]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/) + +## Limitations + +The following limitations apply to flat objects in OpenSearch 2.7: + +- Flat objects do not support open parameters. +- Painless scripting and wildcard queries are not supported for retrieving values of subfields. + +This functionality is planned for a future release. + +## Using flat object + +The following example illustrates mapping a field as a flat object, indexing documents with flat object fields, and searching for leaf values of the flat object in those documents. 

+ +First, create a mapping for your index, where `issue` is of type `flat_object`: + +```json +PUT /test-index/ +{ + "mappings": { + "properties": { + "issue": { + "type": "flat_object" + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index two documents with flat object fields: + +```json +PUT /test-index/_doc/1 +{ + "issue": { + "number": "123456", + "labels": { + "version": "2.1", + "backport": [ + "2.0", + "1.3" + ], + "category": { + "type": "API", + "level": "enhancement" + } + } + } +} +``` +{% include copy-curl.html %} + +```json +PUT /test-index/_doc/2 +{ + "issue": { + "number": "123457", + "labels": { + "version": "2.2", + "category": { + "type": "API", + "level": "bug" + } + } + } +} +``` +{% include copy-curl.html %} + +To search for a leaf value of the flat object, use either a GET or a POST request. Even if you don't know the field names, you can search for a leaf value in the entire flat object. For example, the following request searches for all issues labeled as bugs: + +```json +GET /test-index/_search +{ + "query": { + "match": {"issue": "bug"} + } +} +``` + +Alternatively, if you know the subfield name in which to search, provide the field's path in dot notation: + +```json +GET /test-index/_search +{ + "query": { + "match": {"issue.labels.category.level": "bug"} + } +} +``` +{% include copy-curl.html %} + +In both cases, the response is the same and contains document 2: + +```json +{ + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.0303539, + "hits": [ + { + "_index": "test-index", + "_id": "2", + "_score": 1.0303539, + "_source": { + "issue": { + "number": "123457", + "labels": { + "version": "2.2", + "category": { + "type": "API", + "level": "bug" + } + } + } + } + } + ] + } +} +``` + +Using a prefix query, you can search for all issues for the versions that start with `2.`: + +```json +GET /test-index/_search +{ + "query": { + "prefix": {"issue.labels.version": "2."} + } +} +``` + +With a range query, you can search for all issues for versions 2.0--2.1: + +```json +GET /test-index/_search +{ + "query": { + "range": { + "issue": { + "gte": "2.0", + "lte": "2.1" + } + } + } +} +``` + +## Defining a subfield as a flat object + +You can define a subfield of a JSON object as a flat object. For example, use the following query to define the `issue.labels` as `flat_object`: + +```json +PUT /test-index/ +{ + "mappings": { + "properties": { + "issue": { + "properties": { + "number": { + "type": "double" + }, + "labels": { + "type": "flat_object" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Because `issue.number` is not part of the flat object, you can use it to aggregate and sort documents. \ No newline at end of file diff --git a/_field-types/supported-field-types/geo-point.md b/_field-types/supported-field-types/geo-point.md new file mode 100644 index 00000000..0912dc61 --- /dev/null +++ b/_field-types/supported-field-types/geo-point.md @@ -0,0 +1,113 @@ +--- +layout: default +title: Geopoint +nav_order: 56 +has_children: false +parent: Geographic field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/geo-point/ + - /field-types/geo-point/ +--- + +# Geopoint field type + +A geopoint field type contains a geographic point specified by latitude and longitude. 
+ +## Example + +Create a mapping with a geopoint field type: + +```json +PUT testindex1 +{ + "mappings": { + "properties": { + "point": { + "type": "geo_point" + } + } + } +} +``` +{% include copy-curl.html %} + +## Formats + +Geopoints can be indexed in the following formats: + +- An object with a latitude and longitude + +```json +PUT testindex1/_doc/1 +{ + "point": { + "lat": 40.71, + "lon": 74.00 + } +} +``` +{% include copy-curl.html %} + +- A string in the "`latitude`,`longitude`" format + +```json +PUT testindex1/_doc/2 +{ + "point": "40.71,74.00" +} +``` +{% include copy-curl.html %} + +- A geohash + +```json +PUT testindex1/_doc/3 +{ + "point": "txhxegj0uyp3" +} +``` +{% include copy-curl.html %} + +- An array in the [`longitude`, `latitude`] format + +```json +PUT testindex1/_doc/4 +{ + "point": [74.00, 40.71] +} +``` +{% include copy-curl.html %} + +- A [Well-Known Text](https://docs.opengeospatial.org/is/12-063r5/12-063r5.html) POINT in the "POINT(`longitude` `latitude`)" format + +```json +PUT testindex1/_doc/5 +{ + "point": "POINT (74.00 40.71)" +} +``` +{% include copy-curl.html %} + +- GeoJSON format, where the `coordinates` are in the [`longitude`, `latitude`] format + +```json +PUT testindex1/_doc/6 +{ + "point": { + "type": "Point", + "coordinates": [74.00, 40.71] + } +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by geopoint field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`ignore_malformed` | A Boolean value that specifies to ignore malformed values and not to throw an exception. Valid values for latitude are [-90, 90]. Valid values for longitude are [-180, 180]. Default is `false`. +`ignore_z_value` | Specific to points with three coordinates. If `ignore_z_value` is `true`, the third coordinate is not indexed but is still stored in the _source field. If `ignore_z_value` is `false`, an exception is thrown. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. \ No newline at end of file diff --git a/_field-types/supported-field-types/geo-shape.md b/_field-types/supported-field-types/geo-shape.md new file mode 100644 index 00000000..cbf63551 --- /dev/null +++ b/_field-types/supported-field-types/geo-shape.md @@ -0,0 +1,404 @@ +--- +layout: default +title: Geoshape +nav_order: 57 +has_children: false +parent: Geographic field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/geo-shape/ + - /field-types/geo-shape/ +--- + +# Geoshape field type + +A geoshape field type contains a geographic shape, such as a polygon or a collection of geographic points. To index a geoshape, OpenSearch tesselates the shape into a triangular mesh and stores each triangle in a BKD tree. This provides a 10-7decimal degree of precision, which represents near-perfect spatial resolution. Performance of this process is mostly impacted by the number of vertices in a polygon you are indexing. 
+ +## Example + +Create a mapping with a geoshape field type: + +```json +PUT testindex +{ + "mappings": { + "properties": { + "location": { + "type": "geo_shape" + } + } + } +} +``` +{% include copy-curl.html %} + +## Formats + +Geoshapes can be indexed in the following formats: + +- [GeoJSON](https://geojson.org/) +- [Well-Known Text (WKT)](https://docs.opengeospatial.org/is/12-063r5/12-063r5.html) + +In both GeoJSON and WKT, the coordinates must be specified in the `longitude, latitude` order within coordinate arrays. Note that the longitude comes first in this format. +{: .note} + +## Geoshape types + +The following table describes the possible geoshape types and their relationship to the GeoJSON and WKT types. + +OpenSearch type | GeoJSON type | WKT type | Description +:--- | :--- | :--- | :--- +[`point`](#point) | Point | POINT | A geographic point specified by latitude and longitude. OpenSearch uses World Geodetic System (WGS84) coordinates. +[`linestring`](#linestring) | LineString | LINESTRING | A line specified by two or more points. May be a straight line or a path of connected line segments. +[`polygon`](#polygon) | Polygon | POLYGON | A polygon specified by a list of vertices in coordinate form. The polygon must be closed, meaning the last point must be the same as the first point. Therefore, to create an n-gon, n+1 vertices are required. The minimum number of vertices is four, which creates a triangle. +[`multipoint`](#multipoint) | MultiPoint | MULTIPOINT | An array of discrete related points that are not connected. +[`multilinestring`](#multilinestring) | MultiLineString | MULTILINESTRING | An array of linestrings. +[`multipolygon`](#multipolygon) | MultiPolygon | MULTIPOLYGON | An array of polygons. +[`geometrycollection`](#geometry-collection) | GeometryCollection | GEOMETRYCOLLECTION | A collection of geoshapes that may be of different types. +[`envelope`](#envelope) | N/A | BBOX | A bounding rectangle specified by upper-left and lower-right vertices. + +## Point + +A point is a single pair of coordinates specified by latitude and longitude. + +Index a point in GeoJSON format: + +```json +PUT testindex/_doc/1 +{ + "location" : { + "type" : "point", + "coordinates" : [74.00, 40.71] + } +} +``` +{% include copy-curl.html %} + +Index a point in WKT format: + +```json +PUT testindex/_doc/1 +{ + "location" : "POINT (74.0060 40.7128)" +} +``` +{% include copy-curl.html %} + +## Linestring + +A linestring is a line specified by two or more points. If the points are collinear, the linestring is a straight line. Otherwise, the linestring represents a path made of line segments. + +Index a linestring in GeoJSON format: + +```json +PUT testindex/_doc/2 +{ + "location" : { + "type" : "linestring", + "coordinates" : [[74.0060, 40.7128], [71.0589, 42.3601]] + } +} +``` +{% include copy-curl.html %} + +Index a linestring in WKT format: + +```json +PUT testindex/_doc/2 +{ + "location" : "LINESTRING (74.0060 40.7128, 71.0589 42.3601)" +} +``` +{% include copy-curl.html %} + +## Polygon + +A polygon is specified by a list of vertices in coordinate form. The polygon must be closed, meaning the last point must be the same as the first point. In the following example, a triangle is created using four points. + +GeoJSON requires that you list the vertices of the polygon counterclockwise. WKT does not impose a specific order on vertices. 
+{: .note} + +Index a polygon (triangle) in GeoJSON format: + +```json +PUT testindex/_doc/3 +{ + "location" : { + "type" : "polygon", + "coordinates" : [ + [[74.0060, 40.7128], + [71.0589, 42.3601], + [73.7562, 42.6526], + [74.0060, 40.7128]] + ] + } +} +``` +{% include copy-curl.html %} + +Index a polygon (triangle) in WKT format: + +```json +PUT testindex/_doc/3 +{ + "location" : "POLYGON ((74.0060 40.7128, 71.0589 42.3601, 73.7562 42.6526, 74.0060 40.7128))" +} +``` +{% include copy-curl.html %} + +The polygon may have holes inside. In this case, the `coordinates` field will contain multiple arrays. The first array represents the outer polygon, and each subsequent array represents a hole. Holes are represented as polygons and specified as arrays of coordinates. + +GeoJSON requires that you list the vertices of the polygon counterclockwise and the vertices of the hole clockwise. WKT does not impose a specific order on vertices. +{: .note} + +Index a polygon (triangle) with a triangular hole in GeoJSON format: + +```json +PUT testindex/_doc/4 +{ + "location" : { + "type" : "polygon", + "coordinates" : [ + [[74.0060, 40.7128], + [71.0589, 42.3601], + [73.7562, 42.6526], + [74.0060, 40.7128]], + + [[72.6734,41.7658], + [72.6506, 41.5623], + [73.0515, 41.5582], + [72.6734, 41.7658]] + ] + } +} +``` +{% include copy-curl.html %} + +Index a polygon (triangle) with a triangular hole in WKT format: + +```json +PUT testindex/_doc/4 +{ + "location" : "POLYGON ((40.7128 74.0060, 42.3601 71.0589, 42.6526 73.7562, 40.7128 74.0060), (41.7658 72.6734, 41.5623 72.6506, 41.5582 73.0515, 41.7658 72.6734))" +} +``` +{% include copy-curl.html %} + +In OpenSearch, you can specify a polygon by listing its vertices clockwise or counterclockwise. This works well for polygons that do not cross the date line (are narrower than 180°). However, a polygon that crosses the date line (is wider than 180°) might be ambiguous because WKT does not impose a specific order on vertices. Thus, you must specify polygons that cross the date line by listing their vertices counterclockwise. + +You can define an [`orientation`](#parameters) parameter to specify the vertex traversal order at mapping time: + +```json +PUT testindex +{ + "mappings": { + "properties": { + "location": { + "type": "geo_shape", + "orientation" : "left" + } + } + } +} +``` +{% include copy-curl.html %} + +Subsequently indexed documents can override the `orientation` setting: + +```json +PUT testindex/_doc/3 +{ + "location" : { + "type" : "polygon", + "orientation" : "cw", + "coordinates" : [ + [[74.0060, 40.7128], + [71.0589, 42.3601], + [73.7562, 42.6526], + [74.0060, 40.7128]] + ] + } +} +``` +{% include copy-curl.html %} + +## Multipoint + +A multipoint is an array of discrete related points that are not connected. + +Index a multipoint in GeoJSON format: + +```json +PUT testindex/_doc/6 +{ + "location" : { + "type" : "multipoint", + "coordinates" : [ + [74.0060, 40.7128], + [71.0589, 42.3601] + ] + } +} +``` +{% include copy-curl.html %} + +Index a multipoint in WKT format: + +```json +PUT testindex/_doc/6 +{ + "location" : "MULTIPOINT (74.0060 40.7128, 71.0589 42.3601)" +} +``` +{% include copy-curl.html %} + +## Multilinestring + +A multilinestring is an array of linestrings. 
+ +Index a linestring in GeoJSON format: + +```json +PUT testindex/_doc/2 +{ + "location" : { + "type" : "multilinestring", + "coordinates" : [ + [[74.0060, 40.7128], [71.0589, 42.3601]], + [[73.7562, 42.6526], [72.6734, 41.7658]] + ] + } +} +``` +{% include copy-curl.html %} + +Index a linestring in WKT format: + +```json +PUT testindex/_doc/2 +{ + "location" : "MULTILINESTRING ((74.0060 40.7128, 71.0589 42.3601), (73.7562 42.6526, 72.6734 41.7658))" +} +``` +{% include copy-curl.html %} + +## Multipolygon + +A multipolygon is an array of polygons. In this example, the first polygon contains a hole, and the second does not. + +Index a multipolygon in GeoJSON format: + +```json +PUT testindex/_doc/4 +{ + "location" : { + "type" : "multipolygon", + "coordinates" : [ + [ + [[74.0060, 40.7128], + [71.0589, 42.3601], + [73.7562, 42.6526], + [74.0060, 40.7128]], + + [[72.6734,41.7658], + [72.6506, 41.5623], + [73.0515, 41.5582], + [72.6734, 41.7658]] + ], + [ + [[73.9776, 40.7614], + [73.9554, 40.7827], + [73.9631, 40.7812], + [73.9776, 40.7614]] + ] + ] + } +} +``` +{% include copy-curl.html %} + +Index a multipolygon in WKT format: + +```json +PUT testindex/_doc/4 +{ + "location" : "MULTIPOLYGON (((40.7128 74.0060, 42.3601 71.0589, 42.6526 73.7562, 40.7128 74.0060), (41.7658 72.6734, 41.5623 72.6506, 41.5582 73.0515, 41.7658 72.6734)), ((73.9776 40.7614, 73.9554 40.7827, 73.9631 40.7812, 73.9776 40.7614)))" +} +``` +{% include copy-curl.html %} + +## Geometry collection + +A geometry collection is a collection of geoshapes that may be of different types. + +Index a geometry collection in GeoJSON format: + +```json +PUT testindex/_doc/7 +{ + "location" : { + "type": "geometrycollection", + "geometries": [ + { + "type": "point", + "coordinates": [74.0060, 40.7128] + }, + { + "type": "linestring", + "coordinates": [[73.7562, 42.6526], [72.6734, 41.7658]] + } + ] + } +} +``` +{% include copy-curl.html %} + +Index a geometry collection in WKT format: + +```json +PUT testindex/_doc/7 +{ + "location" : "GEOMETRYCOLLECTION (POINT (74.0060 40.7128), LINESTRING(73.7562 42.6526, 72.6734 41.7658))" +} +``` +{% include copy-curl.html %} + +## Envelope + +An envelope is a bounding rectangle specified by upper-left and lower-right vertices. The GeoJSON format is `[[minLon, maxLat], [maxLon, minLat]]`. + +Index an envelope in GeoJSON format: + +```json +PUT testindex/_doc/2 +{ + "location" : { + "type" : "envelope", + "coordinates" : [[71.0589, 42.3601], [74.0060, 40.7128]] + } +} +``` +{% include copy-curl.html %} + +In WKT format, use `BBOX (minLon, maxLon, maxLat, minLat)`. + +Index an envelope in WKT BBOX format: + +```json +PUT testindex/_doc/8 +{ + "location" : "BBOX (71.0589, 74.0060, 42.3601, 40.7128)" +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by geoshape field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`coerce` | A Boolean value that specifies whether to automatically close unclosed linear rings. Default is `false`. +`ignore_malformed` | A Boolean value that specifies to ignore malformed GeoJSON or WKT geoshapes and not to throw an exception. Default is `false` (throw an exception when geoshapes are malformed). +`ignore_z_value` | Specific to points with three coordinates. If `ignore_z_value` is `true`, the third coordinate is not indexed but is still stored in the _source field. If `ignore_z_value` is `false`, an exception is thrown. Default is `true`. 
+`orientation` | Specifies the traversal order of the vertices in the geoshape's list of coordinates. `orientation` takes the following values:
1. RIGHT: counterclockwise. Specify RIGHT orientation by using one of the following strings (uppercase or lowercase): `right`, `counterclockwise`, `ccw`.
2. LEFT: clockwise. Specify LEFT orientation by using one of the following strings (uppercase or lowercase): `left`, `clockwise`, `cw`. This value can be overridden by individual documents.
Default is `RIGHT`. diff --git a/_field-types/supported-field-types/geographic.md b/_field-types/supported-field-types/geographic.md new file mode 100644 index 00000000..cbe3982a --- /dev/null +++ b/_field-types/supported-field-types/geographic.md @@ -0,0 +1,20 @@ +--- +layout: default +title: Geographic field types +nav_order: 55 +has_children: true +has_toc: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/geographic/ + - /field-types/geographic/ +--- + +# Geographic field types + +Geographic fields contain values that represent points or shapes on a map. The following table lists all geographic field types that OpenSearch supports. + +Field data type | Description +:--- | :--- +[`geo_point`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point/) | A geographic point specified by latitude and longitude. +[`geo_shape`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-shape/) | A geographic shape, such as a polygon or a collection of geographic points. diff --git a/_field-types/supported-field-types/index.md b/_field-types/supported-field-types/index.md new file mode 100644 index 00000000..69ca0032 --- /dev/null +++ b/_field-types/supported-field-types/index.md @@ -0,0 +1,190 @@ +--- +layout: default +title: Supported field types +nav_order: 80 +has_children: true +has_toc: false +redirect_from: + - /opensearch/supported-field-types/ + - /opensearch/supported-field-types/index/ +--- + +# Supported field types + +You can specify data types for your fields when creating a mapping. The following table lists all data field types that OpenSearch supports. + +Category | Field types and descriptions +:--- | :--- +Alias | [`alias`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/alias/): An additional name for an existing field. +Binary | [`binary`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/binary/): A binary value in Base64 encoding. +[Numeric]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/numeric/) | A numeric value (`byte`, `double`, `float`, `half_float`, `integer`, `long`, [`unsigned_long`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/unsigned-long/), `scaled_float`, `short`). +Boolean | [`boolean`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/boolean/): A Boolean value. +[Date]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/dates/)| [`date`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/): A date stored in milliseconds.
[`date_nanos`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date-nanos/): A date stored in nanoseconds. +IP | [`ip`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/ip/): An IP address in IPv4 or IPv6 format. +[Range]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/range/) | A range of values (`integer_range`, `long_range`, `double_range`, `float_range`, `date_range`, `ip_range`). +[Object]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object-fields/)| [`object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object/): A JSON object.
[`nested`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/): Used when objects in an array need to be indexed independently as separate documents.
[`flat_object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/flat-object/): A JSON object treated as a string.
[`join`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/): Establishes a parent-child relationship between documents in the same index. +[String]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/string/)|[`keyword`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/keyword/): Contains a string that is not analyzed.
[`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/): Contains a string that is analyzed.
[`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/): A space-optimized version of a `text` field.
[`token_count`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/token-count/): Stores the number of analyzed tokens in a string. +[Autocomplete]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/autocomplete/) |[`completion`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/completion/): Provides autocomplete functionality through a completion suggester.
[`search_as_you_type`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/search-as-you-type/): Provides search-as-you-type functionality using both prefix and infix completion. +[Geographic]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geographic/)| [`geo_point`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-point/): A geographic point.
[`geo_shape`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-shape/): A geographic shape. +[Rank]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/rank/) | Boosts or decreases the relevance score of documents (`rank_feature`, `rank_features`). +k-NN vector | [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/): Allows indexing a k-NN vector into OpenSearch and performing different kinds of k-NN search. +Percolator | [`percolator`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/percolator/): Specifies to treat this field as a query. + +## Arrays + +There is no dedicated array field type in OpenSearch. Instead, you can pass an array of values into any field. All values in the array must have the same field type. + +```json +PUT testindex1/_doc/1 +{ + "number": 1 +} + +PUT testindex1/_doc/2 +{ + "number": [1, 2, 3] +} +``` + +## Multifields + +Multifields are used to index the same field differently. Strings are often mapped as `text` for full-text queries and `keyword` for exact-value queries. + +Multifields can be created using the `fields` parameter. For example, you can map a book `title` to be of type `text` and keep a `title.raw` subfield of type `keyword`. + +```json +PUT books +{ + "mappings" : { + "properties" : { + "title" : { + "type" : "text", + "fields" : { + "raw" : { + "type" : "keyword" + } + } + } + } + } +} +``` + +## Null value + +Setting a field's value to `null`, an empty array, or an array of `null` values makes this field equivalent to an empty field. Therefore, you cannot search for documents that have `null` in this field. + +To make a field searchable for `null` values, you can specify its `null_value` parameter in the index's mappings. Then, all `null` values passed to this field will be replaced with the specified `null_value`. + +The `null_value` parameter must be of the same type as the field. For example, if your field is a string, the `null_value` for this field must also be a string. +{: .note} + +### Example + +Create a mapping to replace `null` values in the `emergency_phone` field with the string "NONE": + +```json +PUT testindex +{ + "mappings": { + "properties": { + "name": { + "type": "keyword" + }, + "emergency_phone": { + "type": "keyword", + "null_value": "NONE" + } + } + } +} +``` + +Index three documents into testindex. 
The `emergency_phone` fields of documents 1 and 3 contain `null`, while the `emergency_phone` field of document 2 has an empty array: + +```json +PUT testindex/_doc/1 +{ + "name": "Akua Mansa", + "emergency_phone": null +} +``` + +```json +PUT testindex/_doc/2 +{ + "name": "Diego Ramirez", + "emergency_phone" : [] +} +``` + +```json +PUT testindex/_doc/3 +{ + "name": "Jane Doe", + "emergency_phone": [null, null] +} +``` + +Search for people who do not have an emergency phone: + +```json +GET testindex/_search +{ + "query": { + "term": { + "emergency_phone": "NONE" + } + } +} +``` + +The response contains documents 1 and 3 but not document 2 because only explicit `null` values are replaced with the string "NONE": + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 0.18232156, + "hits" : [ + { + "_index" : "testindex", + "_type" : "_doc", + "_id" : "1", + "_score" : 0.18232156, + "_source" : { + "name" : "Akua Mansa", + "emergency_phone" : null + } + }, + { + "_index" : "testindex", + "_type" : "_doc", + "_id" : "3", + "_score" : 0.18232156, + "_source" : { + "name" : "Jane Doe", + "emergency_phone" : [ + null, + null + ] + } + } + ] + } +} +``` + +The `_source` field still contains explicit `null` values because it is not affected by the `null_value`. +{: .note} diff --git a/_field-types/supported-field-types/ip.md b/_field-types/supported-field-types/ip.md new file mode 100644 index 00000000..cb2a5569 --- /dev/null +++ b/_field-types/supported-field-types/ip.md @@ -0,0 +1,120 @@ +--- +layout: default +title: IP address +nav_order: 30 +has_children: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/ip/ + - /field-types/ip/ +--- + +# IP address field type + +An ip field type contains an IP address in IPv4 or IPv6 format. + +To represent IP address ranges, there is an IP [range field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/range/). +{: .note } + +## Example + +Create a mapping with an IP address: + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "ip_address" : { + "type" : "ip" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with an IP address: + +```json +PUT testindex/_doc/1 +{ + "ip_address" : "10.24.34.0" +} +``` +{% include copy-curl.html %} + +Query an index for a specific IP address: + +```json +GET testindex/_doc/1 +{ + "query": { + "term": { + "ip_address": "10.24.34.0" + } + } +} +``` +{% include copy-curl.html %} + +## Searching for an IP address and its associated network mask + +You can query an index for an IP address in [Classless Inter-Domain Routing (CIDR) notation](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing#CIDR_notation). Using CIDR notation, specify the IP address and the prefix length (0–32), separated by `/`. For example, the prefix length of 24 will match all IP addresses with the same initial 24 bits. 
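+
+The following sections show `term` queries that use this notation. As an alternative sketch that reuses the `testindex` index and `ip_address` field from the preceding examples, you can also bound the same `/24` block explicitly with a `range` query, matching addresses from `10.24.34.0` through `10.24.34.255`:
+
+```json
+GET testindex/_search
+{
+  "query": {
+    "range": {
+      "ip_address": {
+        "gte": "10.24.34.0",
+        "lte": "10.24.34.255"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}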
+ +#### Example query in IPv4 format + +```json +GET testindex/_search +{ + "query": { + "term": { + "ip_address": "10.24.34.0/24" + } + } +} +``` +{% include copy-curl.html %} + +#### Example query in IPv6 format + +```json +GET testindex/_search +{ + "query": { + "term": { + "ip_address": "2001:DB8::/24" + } + } +} +``` +{% include copy-curl.html %} + +If you use an IP address in IPv6 format in a `query_string` query, you need to escape `:` characters because they are parsed as special characters. You can accomplish this by wrapping the IP address in quotation marks and escaping those quotation marks with `\`. + +```json +GET testindex/_search +{ + "query" : { + "query_string": { + "query": "ip_address:\"2001:DB8::/24\"" + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by ip field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`boost` | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field's relevance. Values between 0.0 and 1.0 decrease the field's relevance. Default is 1.0. +`doc_values` | A Boolean value that specifies if the field should be stored on disk so that it can be used for aggregations, sorting, or scripting. Default is `true`. +`ignore_malformed` | A Boolean value that specifies to ignore malformed values and not to throw an exception. Default is `false`. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the _source field. Default is `false`. + + diff --git a/_field-types/supported-field-types/join.md b/_field-types/supported-field-types/join.md new file mode 100644 index 00000000..c83705f4 --- /dev/null +++ b/_field-types/supported-field-types/join.md @@ -0,0 +1,327 @@ +--- +layout: default +title: Join +nav_order: 44 +has_children: false +parent: Object field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/join/ + - /field-types/join/ +--- + +# Join field type + +A join field type establishes a parent/child relationship between documents in the same index. + +## Example + +Create a mapping to establish a parent-child relationship between products and their brands: + +```json +PUT testindex1 +{ + "mappings": { + "properties": { + "product_to_brand": { + "type": "join", + "relations": { + "brand": "product" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Then, index a parent document with a join field type: + +```json +PUT testindex1/_doc/1 +{ + "name": "Brand 1", + "product_to_brand": { + "name": "brand" + } +} +``` +{% include copy-curl.html %} + +You can also use a shortcut without object notation to index a parent document: + +```json +PUT testindex1/_doc/1 +{ + "name": "Brand 1", + "product_to_brand" : "brand" +} +``` +{% include copy-curl.html %} + +When indexing child documents, you have to specify the `routing` query parameter because parent and child documents in the same relation have to be indexed on the same shard. Each child document refers to its parent's ID in the `parent` field. 
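+
+The same routing value is also required if you later retrieve a child document directly by its ID. For example, after you index the child documents shown next, a sketch such as the following (using child document ID `3` from this example) fetches one of them:
+
+```json
+GET testindex1/_doc/3?routing=1
+```
+{% include copy-curl.html %}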
+ +Index two child documents, one for each parent: + +```json +PUT testindex1/_doc/3?routing=1 +{ + "name": "Product 1", + "product_to_brand": { + "name": "product", + "parent": "1" + } +} +``` +{% include copy-curl.html %} + +```json +PUT testindex1/_doc/4?routing=1 +{ + "name": "Product 2", + "product_to_brand": { + "name": "product", + "parent": "1" + } +} +``` +{% include copy-curl.html %} + +## Querying a join field + +When you query a join field, the response contains subfields that specify whether the returned document is a parent or a child. For child objects, the parent ID is also returned. + +### Search for all documents + +```json +GET testindex1/_search +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} + +The response indicates whether a document is a parent or a child: + +```json +{ + "took" : 4, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "testindex1", + "_type" : "_doc", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "name" : "Brand 1", + "product_to_brand" : { + "name" : "brand" + } + } + }, + { + "_index" : "testindex1", + "_type" : "_doc", + "_id" : "3", + "_score" : 1.0, + "_routing" : "1", + "_source" : { + "name" : "Product 1", + "product_to_brand" : { + "name" : "product", + "parent" : "1" + } + } + }, + { + "_index" : "testindex1", + "_type" : "_doc", + "_id" : "4", + "_score" : 1.0, + "_routing" : "1", + "_source" : { + "name" : "Product 2", + "product_to_brand" : { + "name" : "product", + "parent" : "1" + } + } + } + ] + } +} +``` + +### Search for all children of a parent + +Find all products associated with Brand 1: + +```json +GET testindex1/_search +{ + "query" : { + "has_parent": { + "parent_type":"brand", + "query": { + "match" : { + "name": "Brand 1" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains Product 1 and Product 2, which are associated with Brand 1: + +```json +{ + "took" : 7, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "testindex1", + "_type" : "_doc", + "_id" : "3", + "_score" : 1.0, + "_routing" : "1", + "_source" : { + "name" : "Product 1", + "product_to_brand" : { + "name" : "product", + "parent" : "1" + } + } + }, + { + "_index" : "testindex1", + "_type" : "_doc", + "_id" : "4", + "_score" : 1.0, + "_routing" : "1", + "_source" : { + "name" : "Product 2", + "product_to_brand" : { + "name" : "product", + "parent" : "1" + } + } + } + ] + } +} +``` + +### Search for the parent of a child + +Find the parent of Product 1: + +```json +GET testindex1/_search +{ + "query" : { + "has_child": { + "type":"product", + "query": { + "match" : { + "name": "Product 1" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response returns Brand 1 as Product 1's parent: + +```json +{ + "took" : 4, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "testindex1", + "_type" : "_doc", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "name" : "Brand 1", + "product_to_brand" : { + "name" : "brand" + } + } + } + ] + } +} +``` + +## Parent with many children + +One 
parent can have many children. Create a mapping with multiple children: + +```json +PUT testindex1 +{ + "mappings": { + "properties": { + "parent_to_child": { + "type": "join", + "relations": { + "parent": ["child 1", "child 2"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Join field type notes + +- There can only be one join field mapping in an index. +- You need to provide the routing parameter when retrieving, updating, or deleting a child document. This is because parent and child documents in the same relation have to be indexed on the same shard. +- Multiple parents are not supported. +- You can add a child document to an existing document only if the existing document is already marked as a parent. +- You can add a new relation to an existing join field. diff --git a/_field-types/supported-field-types/keyword.md b/_field-types/supported-field-types/keyword.md new file mode 100644 index 00000000..eea6cc66 --- /dev/null +++ b/_field-types/supported-field-types/keyword.md @@ -0,0 +1,61 @@ +--- +layout: default +title: Keyword +nav_order: 46 +has_children: false +parent: String field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/keyword/ + - /field-types/keyword/ +--- + +# Keyword field type + +A keyword field type contains a string that is not analyzed. It allows only exact, case-sensitive matches. + +By default, keyword fields are both indexed (because `index` is enabled) and stored on disk (because `doc_values` is enabled). To reduce disk space, you can specify not to index keyword fields by setting `index` to `false`. + +If you need to use a field for full-text search, map it as [`text`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/text/) instead. +{: .note } + +## Example + +The following query creates a mapping with a keyword field. Setting `index` to `false` specifies to store the `genre` field on disk and to retrieve it using `doc_values`: + +```json +PUT movies +{ + "mappings" : { + "properties" : { + "genre" : { + "type" : "keyword", + "index" : false + } + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by keyword field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`boost` | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field's relevance. Values between 0.0 and 1.0 decrease the field's relevance. Default is 1.0. +`doc_values` | A Boolean value that specifies whether the field should be stored on disk so that it can be used for aggregations, sorting, or scripting. Default is `true`. +`eager_global_ordinals` | Specifies whether global ordinals should be loaded eagerly on refresh. If the field is often used for aggregations, this parameter should be set to `true`. Default is `false`. +`fields` | To index the same string in several ways (for example, as a keyword and text), provide the fields parameter. You can specify one version of the field to be used for search and another to be used for sorting and aggregations. +`ignore_above` | Any string longer than this integer value should not be indexed. Default is 2147483647. Default dynamic mapping creates a keyword subfield for which `ignore_above` is set to 256. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. To reduce disk space, set `index` to `false`. 
+`index_options` | Information to be stored in the index that will be considered when calculating relevance scores. Can be set to `freqs` for term frequency. Default is `docs`. +`meta` | Accepts metadata for this field. +[`normalizer`]({{site.url}}{{site.baseurl}}/analyzers/normalizers/) | Specifies how to preprocess this field before indexing (for example, make it lowercase). Default is `null` (no preprocessing). +`norms` | A Boolean value that specifies whether the field length should be used when calculating relevance scores. Default is `false`. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. +`similarity` | The ranking algorithm for calculating relevance scores. Default is `BM25`. +`split_queries_on_whitespace` | A Boolean value that specifies whether full-text queries should be split on white space. Default is `false`. +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the `_source` field. Default is `false`. + diff --git a/_field-types/supported-field-types/knn-vector.md b/_field-types/supported-field-types/knn-vector.md new file mode 100644 index 00000000..741a86da --- /dev/null +++ b/_field-types/supported-field-types/knn-vector.md @@ -0,0 +1,269 @@ +--- +layout: default +title: k-NN vector +nav_order: 58 +has_children: false +parent: Supported field types +has_math: true +--- + +# k-NN vector field type + +The [k-NN plugin]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/) introduces a custom data type, the `knn_vector`, that allows users to ingest their k-NN vectors into an OpenSearch index and perform different kinds of k-NN search. The `knn_vector` field is highly configurable and can serve many different k-NN workloads. In general, a `knn_vector` field can be built either by providing a method definition or specifying a model id. + +## Example + +For example, to map `my_vector1` as a `knn_vector`, use the following request: + +```json +PUT test-index +{ + "settings": { + "index": { + "knn": true, + "knn.algo_param.ef_search": 100 + } + }, + "mappings": { + "properties": { + "my_vector1": { + "type": "knn_vector", + "dimension": 3, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "lucene", + "parameters": { + "ef_construction": 128, + "m": 24 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Method definitions + +[Method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions) are used when the underlying [approximate k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/) algorithm does not require training. For example, the following `knn_vector` field specifies that *nmslib*'s implementation of *hnsw* should be used for approximate k-NN search. During indexing, *nmslib* will build the corresponding *hnsw* segment files. + +```json +"my_vector": { + "type": "knn_vector", + "dimension": 4, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "nmslib", + "parameters": { + "ef_construction": 128, + "m": 24 + } + } +} +``` + +## Model IDs + +Model IDs are used when the underlying Approximate k-NN algorithm requires a training step. As a prerequisite, the +model has to be created with the [Train API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#train-model). 
The +model contains the information needed to initialize the native library segment files. + +```json + "type": "knn_vector", + "model_id": "my-model" +} +``` + +However, if you intend to use Painless scripting or a k-NN score script, you only need to pass the dimension. + ```json + "type": "knn_vector", + "dimension": 128 + } + ``` + +## Lucene byte vector + +By default, k-NN vectors are `float` vectors, where each dimension is 4 bytes. If you want to save storage space, you can use `byte` vectors with the `lucene` engine. In a `byte` vector, each dimension is a signed 8-bit integer in the [-128, 127] range. + +Byte vectors are supported only for the `lucene` engine. They are not supported for the `nmslib` and `faiss` engines. +{: .note} + +In [k-NN benchmarking tests](https://github.com/opensearch-project/k-NN/tree/main/benchmarks/perf-tool), the use of `byte` rather than `float` vectors resulted in a significant reduction in storage and memory usage as well as improved indexing throughput and reduced query latency. Additionally, precision on recall was not greatly affected (note that recall can depend on various factors, such as the [quantization technique](#quantization-techniques) and data distribution). + +When using `byte` vectors, expect some loss of precision in the recall compared to using `float` vectors. Byte vectors are useful in large-scale applications and use cases that prioritize a reduced memory footprint in exchange for a minimal loss of recall. +{: .important} + +Introduced in k-NN plugin version 2.9, the optional `data_type` parameter defines the data type of a vector. The default value of this parameter is `float`. + +To use a `byte` vector, set the `data_type` parameter to `byte` when creating mappings for an index: + + ```json +PUT test-index +{ + "settings": { + "index": { + "knn": true, + "knn.algo_param.ef_search": 100 + } + }, + "mappings": { + "properties": { + "my_vector1": { + "type": "knn_vector", + "dimension": 3, + "data_type": "byte", + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "lucene", + "parameters": { + "ef_construction": 128, + "m": 24 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Then ingest documents as usual. Make sure each dimension in the vector is in the supported [-128, 127] range: + +```json +PUT test-index/_doc/1 +{ + "my_vector1": [-126, 28, 127] +} +``` +{% include copy-curl.html %} + +```json +PUT test-index/_doc/2 +{ + "my_vector1": [100, -128, 0] +} +``` +{% include copy-curl.html %} + +When querying, be sure to use a `byte` vector: + +```json +GET test-index/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector1": { + "vector": [26, -120, 99], + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +### Quantization techniques + +If your vectors are of the type `float`, you need to first convert them to the `byte` type before ingesting the documents. This conversion is accomplished by _quantizing the dataset_---reducing the precision of its vectors. There are many quantization techniques, such as scalar quantization or product quantization (PQ), which is used in the Faiss engine. The choice of quantization technique depends on the type of data you're using and can affect the accuracy of recall values. 
The following sections describe the scalar quantization algorithms that were used to quantize the [k-NN benchmarking test](https://github.com/opensearch-project/k-NN/tree/main/benchmarks/perf-tool) data for the [L2](#scalar-quantization-for-the-l2-space-type) and [cosine similarity](#scalar-quantization-for-the-cosine-similarity-space-type) space types. The provided pseudocode is for illustration purposes only.
+
+#### Scalar quantization for the L2 space type
+
+The following example pseudocode illustrates the scalar quantization technique used for the benchmarking tests on Euclidean datasets with the L2 space type. Euclidean distance is shift invariant. If you shift both $$x$$ and $$y$$ by the same $$z$$, then the distance remains the same ($$\lVert x-y\rVert =\lVert (x-z)-(y-z)\rVert$$).
+
+```python
+import numpy as np
+
+# Random dataset (example of creating a random dataset)
+dataset = np.random.uniform(-300, 300, (100, 10))
+# Random query set (example of creating a random query set)
+queryset = np.random.uniform(-350, 350, (100, 10))
+# Number of values
+B = 256
+
+# INDEXING:
+# Get min and max
+dataset_min = np.min(dataset)
+dataset_max = np.max(dataset)
+# Shift coordinates to be non-negative
+dataset -= dataset_min
+# Normalize into [0, 1]
+dataset *= 1. / (dataset_max - dataset_min)
+# Bucket into 256 values
+dataset = np.floor(dataset * (B - 1)) - int(B / 2)
+
+# QUERYING:
+# Clip (if the queryset range is outside the dataset range)
+queryset = queryset.clip(dataset_min, dataset_max)
+# Shift coordinates to be non-negative
+queryset -= dataset_min
+# Normalize
+queryset *= 1. / (dataset_max - dataset_min)
+# Bucket into 256 values
+queryset = np.floor(queryset * (B - 1)) - int(B / 2)
+```
+{% include copy.html %}
+
+#### Scalar quantization for the cosine similarity space type
+
+The following example pseudocode illustrates the scalar quantization technique used for the benchmarking tests on angular datasets with the cosine similarity space type. Cosine similarity is not shift invariant ($$cos(x, y) \neq cos(x-z, y-z)$$).
+
+The following pseudocode is for positive numbers:
+
+```python
+# For Positive Numbers
+
+# INDEXING and QUERYING:
+
+# Get Max of train dataset
+max = np.max(dataset)
+min = 0
+B = 127
+
+# Normalize into [0,1]
+val = (val - min) / (max - min)
+val = (val * B)
+
+# Get int and fraction values
+int_part = floor(val)
+frac_part = val - int_part
+
+if 0.5 < frac_part:
+    bval = int_part + 1
+else:
+    bval = int_part
+
+return Byte(bval)
+```
+{% include copy.html %}
+
+The following pseudocode is for negative numbers:
+
+```python
+# For Negative Numbers
+
+# INDEXING and QUERYING:
+
+# Get Min of train dataset
+min = 0
+max = -np.min(dataset)
+B = 128
+
+# Normalize into [0,1]
+val = (val - min) / (max - min)
+val = (val * B)
+
+# Get int and fraction values
+int_part = floor(val)
+frac_part = val - int_part
+
+if 0.5 < frac_part:
+    bval = int_part + 1
+else:
+    bval = int_part
+
+return Byte(bval)
```
+{% include copy.html %}
diff --git a/_field-types/supported-field-types/match-only-text.md b/_field-types/supported-field-types/match-only-text.md
new file mode 100644
index 00000000..fd2c6b58
--- /dev/null
+++ b/_field-types/supported-field-types/match-only-text.md
@@ -0,0 +1,101 @@
+---
+layout: default
+title: Match-only text
+nav_order: 61
+has_children: false
+parent: String field types
+grand_parent: Supported field types
+---
+
+# Match-only text field type
+
+A `match_only_text` field is a variant of a `text` field designed for full-text search when scoring and positional information of terms within a document are not critical.
+
+A `match_only_text` field differs from a `text` field in the following ways:
+
+ - Omits storing positions, frequencies, and norms, reducing storage requirements.
+ - Disables scoring so that all matching documents receive a constant score of 1.0.
+ - Supports all query types except interval and span queries.
+
+Choose the `match_only_text` field type to prioritize efficient full-text search over complex ranking and positional queries while optimizing storage costs. Using `match_only_text` creates significantly smaller indexes, which results in lower storage costs, especially for large datasets.
+
+Use a `match_only_text` field when you need to quickly find documents containing specific terms without the overhead of storing frequencies and positions. The `match_only_text` field type is not the best choice for ranking results based on relevance or for queries that rely on term proximity or order, like interval or span queries. While this field type does support phrase queries, their performance isn't as efficient as when using the `text` field type. If identifying exact phrases or their locations within documents is essential, use the `text` field type instead.
+
+## Example
+
+Create a mapping with a `match_only_text` field:
+
+```json
+PUT movies
+{
+  "mappings" : {
+    "properties" : {
+      "title" : {
+        "type" : "match_only_text"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Parameters
+
+While `match_only_text` supports most parameters available for `text` fields, modifying most of them can be counterproductive. This field type is intended to be simple and efficient, minimizing data stored in the index to optimize storage costs. Therefore, keeping the default settings is generally the best approach. Any modifications beyond analyzer settings can reintroduce overhead and negate the efficiency benefits of `match_only_text`.
+
+The following table lists all parameters available for `match_only_text` fields.
+ +Parameter | Description +:--- | :--- +`analyzer` | The analyzer to be used for the field. By default, it will be used at index time and at search time. To override it at search time, set the `search_analyzer` parameter. Default is the `standard` analyzer, which uses grammar-based tokenization and is based on the [Unicode Text Segmentation](https://unicode.org/reports/tr29/) algorithm. +`boost` | All hits are assigned a score of 1 and are multiplied by `boost` to produce the final score for the query clause. +`eager_global_ordinals` | Specifies whether global ordinals should be loaded eagerly on refresh. If the field is often used for aggregations, this parameter should be set to `true`. Default is `false`. +`fielddata` | A Boolean value that specifies whether to access analyzed tokens for sorting, aggregation, and scripting. Default is `false`. +`fielddata_frequency_filter` | A JSON object specifying that only those analyzed tokens whose document frequency is between the `min` and `max` values (provided as either an absolute number or a percentage) should be loaded into memory. Frequency is computed per segment. Parameters: `min`, `max`, `min_segment_size`. Default is to load all analyzed tokens. +`fields` | To index the same string in several ways (for example, as a keyword and text), provide the `fields` parameter. You can specify one version of the field to be used for search and another to be used for sorting and aggregation. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +`index_options` | You cannot modify this parameter. +`index_phrases` | Not supported. +`index_prefixes` | Not supported. +`meta` | Accepts metadata for this field. +`norms` | Norms are disabled and cannot be enabled. +`position_increment_gap` | Although positions are disabled, `position_increment_gap` behaves similarly to the `text` field when used in phrase queries. Such queries may be slower but are still functional. +`similarity` | Setting similarity has no impact. The `match_only_text` field type doesn't support queries like `more_like_this`, which rely on similarity. Use a `keyword` or `text` field for queries that rely on similarity. +`term_vector` | Term vectors are supported, but using them is discouraged because it contradicts the primary purpose of this field---storage optimization. + +## Migrating a field from `text` to `match_only_text` + +You can use the [Reindex API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/reindex/) to migrate from a `text` field to `match_only_text` by updating the correct mapping in the destination index. + +In the following example, the `source` index contains a `title` field of type `text`. 
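+
+If you are following along and the `source` index does not yet exist, you can create it first. The following request is a minimal sketch that assumes the index contains only the `title` field:
+
+```json
+PUT source
+{
+  "mappings" : {
+    "properties" : {
+      "title" : {
+        "type" : "text"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}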
+
+Create a destination index with the `title` field mapped as `match_only_text`:
+
+```json
+PUT destination
+{
+  "mappings" : {
+    "properties" : {
+      "title" : {
+        "type" : "match_only_text"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Reindex the data:
+
+```json
+POST _reindex
+{
+  "source": {
+    "index":"source"
+  },
+  "dest": {
+    "index":"destination"
+  }
+}
+```
+{% include copy-curl.html %}
diff --git a/_field-types/supported-field-types/nested.md b/_field-types/supported-field-types/nested.md
new file mode 100644
index 00000000..d61ccd53
--- /dev/null
+++ b/_field-types/supported-field-types/nested.md
@@ -0,0 +1,314 @@
+---
+layout: default
+title: Nested
+nav_order: 42
+has_children: false
+parent: Object field types
+grand_parent: Supported field types
+redirect_from:
+  - /opensearch/supported-field-types/nested/
+  - /field-types/nested/
+---
+
+# Nested field type
+
+A nested field type is a special type of [object field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/object/).
+
+Any object field can take an array of objects. Each of the objects in the array is dynamically mapped as an object field type and stored in flattened form. This means that the objects in the array are broken down into individual fields, and values for each field across all objects are stored together. It is sometimes necessary to use the nested type to preserve a nested object as a whole so that you can perform a search on it.
+
+## Flattened form
+
+By default, each of the objects in the array is dynamically mapped as an object field type. For example, index a document that contains an array of patient objects:
+
+```json
+PUT testindex1/_doc/100
+{
+  "patients": [
+    {"name" : "John Doe", "age" : 56, "smoker" : true},
+    {"name" : "Mary Major", "age" : 85, "smoker" : false}
+  ]
+}
+```
+{% include copy-curl.html %}
+
+When these objects are stored, they are flattened, so their internal representation has an array of all values for each field:
+
+```json
+{
+  "patients.name" : ["John Doe", "Mary Major"],
+  "patients.age" : [56, 85],
+  "patients.smoker" : [true, false]
+}
+```
+
+Some queries will work correctly in this representation. If you search for patients older than 75 OR smokers, document 100 should match.
+
+```json
+GET testindex1/_search
+{
+  "query": {
+    "bool": {
+      "should": [
+        {
+          "term": {
+            "patients.smoker": true
+          }
+        },
+        {
+          "range": {
+            "patients.age": {
+              "gte": 75
+            }
+          }
+        }
+      ]
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+The query correctly returns document 100:
+
+```json
+{
+  "took" : 3,
+  "timed_out" : false,
+  "_shards" : {
+    "total" : 1,
+    "successful" : 1,
+    "skipped" : 0,
+    "failed" : 0
+  },
+  "hits" : {
+    "total" : {
+      "value" : 1,
+      "relation" : "eq"
+    },
+    "max_score" : 1.3616575,
+    "hits" : [
+      {
+        "_index" : "testindex1",
+        "_type" : "_doc",
+        "_id" : "100",
+        "_score" : 1.3616575,
+        "_source" : {
+          "patients" : [
+            {
+              "name" : "John Doe",
+              "age" : 56,
+              "smoker" : true
+            },
+            {
+              "name" : "Mary Major",
+              "age" : 85,
+              "smoker" : false
+            }
+          ]
+        }
+      }
+    ]
+  }
+}
+```
+
+Alternatively, if you search for patients older than 75 AND smokers, document 100 should not match.
+
+```json
+GET testindex1/_search
+{
+  "query": {
+    "bool": {
+      "must": [
+        {
+          "term": {
+            "patients.smoker": true
+          }
+        },
+        {
+          "range": {
+            "patients.age": {
+              "gte": 75
+            }
+          }
+        }
+      ]
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+However, this query still incorrectly returns document 100.
This is because the relation between age and smoking was lost when arrays of values for individual fields were created. + +## Nested field type + +Nested objects are stored as separate documents, and the parent object has references to its children. To mark objects as nested, create a mapping with a nested field type. + +```json +PUT testindex1 +{ + "mappings" : { + "properties": { + "patients": { + "type" : "nested" + } + } + } +} +``` +{% include copy-curl.html %} + +Then, index a document with a nested field type: + +```json +PUT testindex1/_doc/100 +{ + "patients": [ + {"name" : "John Doe", "age" : 56, "smoker" : true}, + {"name" : "Mary Major", "age" : 85, "smoker" : false} + ] +} +``` +{% include copy-curl.html %} + +You can use the following nested query to search for patients older than 75 OR smokers: + +```json +GET testindex1/_search +{ + "query": { + "nested": { + "path": "patients", + "query": { + "bool": { + "should": [ + { + "term": { + "patients.smoker": true + } + }, + { + "range": { + "patients.age": { + "gte": 75 + } + } + } + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query correctly returns both patients: + +```json +{ + "took" : 7, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 0.8465736, + "hits" : [ + { + "_index" : "testindex1", + "_id" : "100", + "_score" : 0.8465736, + "_source" : { + "patients" : [ + { + "name" : "John Doe", + "age" : 56, + "smoker" : true + }, + { + "name" : "Mary Major", + "age" : 85, + "smoker" : false + } + ] + } + } + ] + } +} +``` + +You can use the following nested query to search for patients older than 75 AND smokers: + +```json +GET testindex1/_search +{ + "query": { + "nested": { + "path": "patients", + "query": { + "bool": { + "must": [ + { + "term": { + "patients.smoker": true + } + }, + { + "range": { + "patients.age": { + "gte": 75 + } + } + } + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +The previous query returns no results, as expected: + +```json +{ + "took" : 7, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + } +} +``` + +## Parameters + +The following table lists the parameters accepted by object field types. All parameters are optional. + +Parameter | Description +:--- | :--- +[`dynamic`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/object#the-dynamic-parameter) | Specifies whether new fields can be dynamically added to this object. Valid values are `true`, `false`, and `strict`. Default is `true`. +`include_in_parent` | A Boolean value that specifies whether all fields in the child nested object should also be added to the parent document in flattened form. Default is `false`. +`include_in_root` | A Boolean value that specifies whether all fields in the child nested object should also be added to the root document in flattened form. Default is `false`. +`properties` | Fields of this object, which can be of any supported type. New properties can be dynamically added to this object if `dynamic` is set to `true`. 
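+
+For example, to also copy the nested fields into the parent document in flattened form, so that they can additionally be queried without a `nested` query, set `include_in_parent` to `true`. The following mapping is a minimal sketch; the index name is illustrative:
+
+```json
+PUT testindex2
+{
+  "mappings" : {
+    "properties": {
+      "patients": {
+        "type" : "nested",
+        "include_in_parent": true
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}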
diff --git a/_field-types/supported-field-types/numeric.md b/_field-types/supported-field-types/numeric.md
new file mode 100644
index 00000000..9f56d6a9
--- /dev/null
+++ b/_field-types/supported-field-types/numeric.md
@@ -0,0 +1,119 @@
+---
+layout: default
+title: Numeric field types
+parent: Supported field types
+nav_order: 15
+has_children: true
+redirect_from:
+  - /opensearch/supported-field-types/numeric/
+  - /field-types/numeric/
+---
+
+# Numeric field types
+
+The following table lists all numeric field types that OpenSearch supports.
+
+Field data type | Description
+:--- | :---
+`byte` | A signed 8-bit integer. Minimum is −128. Maximum is 127.
+`double` | A double-precision 64-bit IEEE 754 floating-point value. Minimum magnitude is 2<sup>−1074</sup>. Maximum magnitude is (2 − 2<sup>−52</sup>) · 2<sup>1023</sup>. The number of significant bits is 53. The number of significant digits is 15.95.
+`float` | A single-precision 32-bit IEEE 754 floating-point value. Minimum magnitude is 2<sup>−149</sup>. Maximum magnitude is (2 − 2<sup>−23</sup>) · 2<sup>127</sup>. The number of significant bits is 24. The number of significant digits is 7.22.
+`half_float` | A half-precision 16-bit IEEE 754 floating-point value. Minimum magnitude is 2<sup>−24</sup>. Maximum magnitude is 65504. The number of significant bits is 11. The number of significant digits is 3.31.
+`integer` | A signed 32-bit integer. Minimum is −2<sup>31</sup>. Maximum is 2<sup>31</sup> − 1.
+`long` | A signed 64-bit integer. Minimum is −2<sup>63</sup>. Maximum is 2<sup>63</sup> − 1.
+[`unsigned_long`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/unsigned-long/) | An unsigned 64-bit integer. Minimum is 0. Maximum is 2<sup>64</sup> − 1.
+`short` | A signed 16-bit integer. Minimum is −2<sup>15</sup>. Maximum is 2<sup>15</sup> − 1.
+[`scaled_float`](#scaled-float-field-type) | A floating-point value that is multiplied by a double scaling factor and stored as a long value.
+
+Integer, long, float, and double field types have corresponding [range field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/range/).
+{: .note }
+
+If your numeric field contains an identifier such as an ID, you can map this field as a [keyword]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/keyword/) to optimize for faster term-level queries. If you need to use range queries on this field, you can map this field as a numeric field type in addition to a keyword field type.
+{: .tip }
+
+## Example
+
+Create a mapping where `integer_value` is an integer field:
+
+```json
+PUT testindex
+{
+  "mappings" : {
+    "properties" : {
+      "integer_value" : {
+        "type" : "integer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Index a document with an integer value:
+
+```json
+PUT testindex/_doc/1
+{
+  "integer_value" : 123
+}
+```
+{% include copy-curl.html %}
+
+## Scaled float field type
+
+A scaled float field type is a floating-point value that is multiplied by the scaling factor and stored as a long value. It accepts all of the optional parameters accepted by numeric field types, plus an additional `scaling_factor` parameter. The scaling factor is required when creating a scaled float.
+
+Scaled floats are useful for saving disk space. Larger `scaling_factor` values lead to better accuracy but higher space overhead.
+{: .note } + +## Scaled float example + +Create a mapping where `scaled` is a scaled_float field: + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "scaled" : { + "type" : "scaled_float", + "scaling_factor" : 10 + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with a scaled_float value: + +```json +PUT testindex/_doc/1 +{ + "scaled" : 2.3 +} +``` +{% include copy-curl.html %} + +The `scaled` value will be stored as 23. + +## Parameters + +The following table lists the parameters accepted by numeric field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`boost` | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field's relevance. Values between 0.0 and 1.0 decrease the field's relevance. Default is 1.0. +`coerce` | A Boolean value that signals to truncate decimals for integer values and to convert strings to numeric values. Default is `true`. +`doc_values` | A Boolean value that specifies whether the field should be stored on disk so that it can be used for aggregations, sorting, or scripting. Default is `true`. +`ignore_malformed` | A Boolean value that specifies to ignore malformed values and not to throw an exception. Default is `false`. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +`meta` | Accepts metadata for this field. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the _source field. Default is `false`. + +Scaled float has an additional required parameter: `scaling_factor`. + +Parameter | Description +:--- | :--- +`scaling_factor` | A double value that is multiplied by the field value and rounded to the nearest long. Required. diff --git a/_field-types/supported-field-types/object-fields.md b/_field-types/supported-field-types/object-fields.md new file mode 100644 index 00000000..429c5b94 --- /dev/null +++ b/_field-types/supported-field-types/object-fields.md @@ -0,0 +1,23 @@ +--- +layout: default +title: Object field types +nav_order: 40 +has_children: true +has_toc: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/object-fields/ + - /field-types/object-fields/ +--- + +# Object field types + +Object field types contain values that are objects or relations. The following table lists all object field types that OpenSearch supports. + +Field data type | Description +:--- | :--- +[`object`]({{site.url}}{{site.baseurl}}/field-types/object/) | A JSON object. +[`nested`]({{site.url}}{{site.baseurl}}/field-types/nested/) | Used when objects in an array need to be indexed independently as separate documents. +[`flat_object`]({{site.url}}{{site.baseurl}}/field-types/flat-object/) | A JSON object treated as a string. +[`join`]({{site.url}}{{site.baseurl}}/field-types/join/) | Establishes a parent-child relationship between documents in the same index. 
+ diff --git a/_field-types/supported-field-types/object.md b/_field-types/supported-field-types/object.md new file mode 100644 index 00000000..372a5c46 --- /dev/null +++ b/_field-types/supported-field-types/object.md @@ -0,0 +1,154 @@ +--- +layout: default +title: Object +nav_order: 41 +has_children: false +parent: Object field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/object/ + - /field-types/object/ +--- + +# Object field type + +An object field type contains a JSON object (a set of name/value pairs). A value in a JSON object may be another JSON object. It is not necessary to specify `object` as the type when mapping object fields because `object` is the default type. + +## Example + +Create a mapping with an object field: + +```json +PUT testindex1/_mappings +{ + "properties": { + "patient": { + "properties" : + { + "name" : { + "type" : "text" + }, + "id" : { + "type" : "keyword" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with an object field: + +```json +PUT testindex1/_doc/1 +{ + "patient": { + "name" : "John Doe", + "id" : "123456" + } +} +``` +{% include copy-curl.html %} + +Nested objects are stored as flat key/value pairs internally. To refer to a field in a nested object, use `parent field`.`child field` (for example, `patient.id`). + +Search for a patient with ID 123456: + +```json +GET testindex1/_search +{ + "query": { + "term" : { + "patient.id" : "123456" + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by object field types. All parameters are optional. + +Parameter | Description +:--- | :--- +[`dynamic`](#the-dynamic-parameter) | Specifies whether new fields can be dynamically added to this object. Valid values are `true`, `false`, and `strict`. Default is `true`. +`enabled` | A Boolean value that specifies whether the JSON contents of the object should be parsed. If `enabled` is set to `false`, the object's contents are not indexed or searchable, but they are still retrievable from the _source field. Default is `true`. +`properties` | Fields of this object, which can be of any supported type. New properties can be dynamically added to this object if `dynamic` is set to `true`. + +### The `dynamic` parameter + +The `dynamic` parameter specifies whether new fields can be dynamically added to an object that is already indexed. + +For example, you can initially create a mapping with a `patient` object that has only one field: + +```json +PUT testindex1/_mappings +{ + "properties": { + "patient": { + "properties" : + { + "name" : { + "type" : "text" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Then you index a document with a new `id` field in `patient`: + +```json +PUT testindex1/_doc/1 +{ + "patient": { + "name" : "John Doe", + "id" : "123456" + } +} +``` +{% include copy-curl.html %} + +As a result, the field `id` is added to the mappings: + +```json +{ + "testindex1" : { + "mappings" : { + "properties" : { + "patient" : { + "properties" : { + "id" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword", + "ignore_above" : 256 + } + } + }, + "name" : { + "type" : "text" + } + } + } + } + } + } +} +``` + +The `dynamic` parameter has the following valid values. + +Value | Description +:--- | :--- +`true` | New fields can be added to the mapping dynamically. This is the default. +`false` | New fields cannot be added to the mapping dynamically. 
If a new field is detected, it is not indexed or searchable. However, it is still retrievable from the _source field. +`strict` | When new fields are added to the mapping dynamically, an exception is thrown. To add a new field to an object, you have to add it to the mapping first. + +Inner objects inherit the `dynamic` parameter value from their parent unless they declare their own `dynamic` parameter value. +{: .note } diff --git a/_field-types/supported-field-types/percolator.md b/_field-types/supported-field-types/percolator.md new file mode 100644 index 00000000..92325b61 --- /dev/null +++ b/_field-types/supported-field-types/percolator.md @@ -0,0 +1,160 @@ +--- +layout: default +title: Percolator +nav_order: 65 +has_children: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/percolator/ + - /field-types/percolator/ +--- + +# Percolator field type + +A percolator field type specifies to treat this field as a query. Any JSON object field can be marked as a percolator field. Normally, documents are indexed and searches are run against them. When you use a percolator field, you store a search, and later the percolate query matches documents to that search. + +## Example + +A customer is searching for a table priced at $400 or less and wants to create an alert for this search. + +Create a mapping assigning a percolator field type to the query field: + +```json +PUT testindex1 +{ + "mappings": { + "properties": { + "search": { + "properties": { + "query": { + "type": "percolator" + } + } + }, + "price": { + "type": "float" + }, + "item": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a query: + +```json +PUT testindex1/_doc/1 +{ + "search": { + "query": { + "bool": { + "filter": [ + { + "match": { + "item": { + "query": "table" + } + } + }, + { + "range": { + "price": { + "lte": 400.00 + } + } + } + ] + } + } + } +} +``` +{% include copy-curl.html %} + +Fields referenced in the query must already exist in the mapping. 
+{: .note } + +Run a percolate query to search for matching documents: + +```json +GET testindex1/_search +{ + "query" : { + "bool" : { + "filter" : + { + "percolate" : { + "field" : "search.query", + "document" : { + "item" : "Mahogany table", + "price": 399.99 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the originally indexed query: + +```json +{ + "took" : 30, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 0.0, + "hits" : [ + { + "_index" : "testindex1", + "_type" : "_doc", + "_id" : "1", + "_score" : 0.0, + "_source" : { + "search" : { + "query" : { + "bool" : { + "filter" : [ + { + "match" : { + "item" : { + "query" : "table" + } + } + }, + { + "range" : { + "price" : { + "lte" : 400.0 + } + } + } + ] + } + } + } + }, + "fields" : { + "_percolator_document_slot" : [ + 0 + ] + } + } + ] + } +} +``` \ No newline at end of file diff --git a/_field-types/supported-field-types/range.md b/_field-types/supported-field-types/range.md new file mode 100644 index 00000000..22ae1d61 --- /dev/null +++ b/_field-types/supported-field-types/range.md @@ -0,0 +1,159 @@ +--- +layout: default +title: Range field types +nav_order: 35 +has_children: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/range/ + - /field-types/range/ +--- + +# Range field types + +The following table lists all range field types that OpenSearch supports. + +Field data type | Description +:--- | :--- +`integer_range` | A range of [integer]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/) values. +`long_range` | A range of [long]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/) values. +`double_range` | A range of [double]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/) values. +`float_range` | A range of [float]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/) values. +`ip_range` | A range of [IP addresses]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/ip/) in IPv4 or IPv6 format. Start and end IP addresses may be in different formats. +`date_range` | A range of [date]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/) values. Start and end dates may be in different [formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/#formats). Internally, all dates are stored as unsigned 64-bit integers representing milliseconds since the epoch. + +## Example + +Create a mapping with a double range and a date range: + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "gpa" : { + "type" : "double_range" + }, + "graduation_date" : { + "type" : "date_range", + "format" : "strict_year_month||strict_year_month_day" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with a double range and a date range: + +```json +PUT testindex/_doc/1 +{ + "gpa" : { + "gte" : 1.0, + "lte" : 4.0 + }, + "graduation_date" : { + "gte" : "2019-05-01", + "lte" : "2019-05-15" + } +} +``` +{% include copy-curl.html %} + +## IP address ranges + +You can specify IP address ranges in two formats: as a range and in [CIDR notation](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing#CIDR_notation). 
+ +Create a mapping with an IP address range: + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "ip_address_range" : { + "type" : "ip_range" + }, + "ip_address_cidr" : { + "type" : "ip_range" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with IP address ranges in both formats: + +```json +PUT testindex/_doc/2 +{ + "ip_address_range" : { + "gte" : "10.24.34.0", + "lte" : "10.24.35.255" + }, + "ip_address_cidr" : "10.24.34.0/24" +} +``` +{% include copy-curl.html %} + +## Querying range fields + +You can use a [Term query](#term-query) or a [Range query](#range-query) to search for values within range fields. + +### Term query + +A term query takes a value and matches all range fields for which the value is within the range. + +The following query will return document 1 because 3.5 is within the range [1.0, 4.0]: + +```json +GET testindex/_search +{ + "query" : { + "term" : { + "gpa" : { + "value" : 3.5 + } + } + } +} +``` +{% include copy-curl.html %} + +### Range query + +A range query on a range field returns documents within that range. + +Query for all graduation dates in 2019, providing the date range in a "MM/dd/yyyy" format: + +```json +GET testindex1/_search +{ + "query": { + "range": { + "graduation_date": { + "gte": "01/01/2019", + "lte": "12/31/2019", + "format": "MM/dd/yyyy", + "relation" : "within" + } + } + } +} +``` +{% include copy-curl.html %} + +The preceding query will return document 1 for the `within` and `intersects` relations but will not return it for the `contains` relation. For more information about relation types, see [range query parameters]({{site.url}}{{site.baseurl}}/query-dsl/term/range#parameters). + +## Parameters + +The following table lists the parameters accepted by range field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`boost` | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field's relevance. Values between 0.0 and 1.0 decrease the field's relevance. Default is 1.0. +`coerce` | A Boolean value that signals to truncate decimals for integer values and to convert strings to numeric values. Default is `true`. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the _source field. Default is `false`. diff --git a/_field-types/supported-field-types/rank.md b/_field-types/supported-field-types/rank.md new file mode 100644 index 00000000..a4ec0fac --- /dev/null +++ b/_field-types/supported-field-types/rank.md @@ -0,0 +1,282 @@ +--- +layout: default +title: Rank field types +nav_order: 60 +has_children: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/rank/ + - /field-types/rank/ +--- + +# Rank field types + +The following table lists all rank field types that OpenSearch supports. + +Field data type | Description +:--- | :--- +[`rank_feature`](#rank-feature) | Boosts or decreases the relevance score of documents. +[`rank_features`](#rank-features) | Boosts or decreases the relevance score of documents. Used when the list of features is sparse. + +Rank feature and rank features fields can be queried with [rank feature queries](#rank-feature-query) only. They do not support aggregating or sorting. 
+{: .note } + +## Rank feature + +A rank feature field type uses a positive [float]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/) value to boost or decrease the relevance score of a document in a `rank_feature` query. By default, this value boosts the relevance score. To decrease the relevance score, set the optional `positive_score_impact` parameter to false. + +### Example + +Create a mapping with a rank feature field: + +```json +PUT chessplayers +{ + "mappings": { + "properties": { + "name" : { + "type" : "text" + }, + "rating": { + "type": "rank_feature" + }, + "age": { + "type": "rank_feature", + "positive_score_impact": false + } + } + } +} +``` +{% include copy-curl.html %} + +Index three documents with a rank_feature field that boosts the score (`rating`) and a rank_feature field that decreases the score (`age`): + +```json +PUT testindex1/_doc/1 +{ + "name" : "John Doe", + "rating" : 2554, + "age" : 75 +} +``` +{% include copy-curl.html %} + +```json +PUT testindex1/_doc/2 +{ + "name" : "Kwaku Mensah", + "rating" : 2067, + "age": 10 +} +``` +{% include copy-curl.html %} + +```json +PUT testindex1/_doc/3 +{ + "name" : "Nikki Wolf", + "rating" : 1864, + "age" : 22 +} +``` +{% include copy-curl.html %} + +## Rank feature query + +Using a rank feature query, you can rank players by rating, by age, or by both rating and age. If you rank players by rating, higher-rated players will have higher relevance scores. If you rank players by age, younger players will have higher relevance scores. + +Use a rank feature query to search for players based on age and rating: + +```json +GET chessplayers/_search +{ + "query": { + "bool": { + "should": [ + { + "rank_feature": { + "field": "rating" + } + }, + { + "rank_feature": { + "field": "age" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +When ranked by both age and rating, younger players and players who are more highly ranked score better: + +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.2093145, + "hits" : [ + { + "_index" : "chessplayers", + "_type" : "_doc", + "_id" : "2", + "_score" : 1.2093145, + "_source" : { + "name" : "Kwaku Mensah", + "rating" : 1967, + "age" : 10 + } + }, + { + "_index" : "chessplayers", + "_type" : "_doc", + "_id" : "3", + "_score" : 1.0150313, + "_source" : { + "name" : "Nikki Wolf", + "rating" : 1864, + "age" : 22 + } + }, + { + "_index" : "chessplayers", + "_type" : "_doc", + "_id" : "1", + "_score" : 0.8098284, + "_source" : { + "name" : "John Doe", + "rating" : 2554, + "age" : 75 + } + } + ] + } +} +``` + +## Rank features + +A rank features field type is similar to the rank feature field type, but it is more suitable for a sparse list of features. A rank features field can index numeric feature vectors that are later used to boost or decrease documents' relevance scores in `rank_feature` queries. 
+ +### Example + +Create a mapping with a rank features field: + +```json +PUT testindex1 +{ + "mappings": { + "properties": { + "correlations": { + "type": "rank_features" + } + } + } +} +``` +{% include copy-curl.html %} + +To index a document with a rank features field, use a hashmap with string keys and positive float values: + +```json +PUT testindex1/_doc/1 +{ + "correlations": { + "young kids" : 1, + "older kids" : 15, + "teens" : 25.9 + } +} +``` +{% include copy-curl.html %} + +```json +PUT testindex1/_doc/2 +{ + "correlations": { + "teens": 10, + "adults": 95.7 + } +} +``` +{% include copy-curl.html %} + +Query the documents using a rank feature query: + +```json +GET testindex1/_search +{ + "query": { + "rank_feature": { + "field": "correlations.teens" + } + } +} +``` +{% include copy-curl.html %} + +The response is ranked by relevance score: + +```json +{ + "took" : 123, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 0.6258503, + "hits" : [ + { + "_index" : "testindex1", + "_type" : "_doc", + "_id" : "1", + "_score" : 0.6258503, + "_source" : { + "correlations" : { + "young kids" : 1, + "older kids" : 15, + "teens" : 25.9 + } + } + }, + { + "_index" : "testindex1", + "_type" : "_doc", + "_id" : "2", + "_score" : 0.39263803, + "_source" : { + "correlations" : { + "teens" : 10, + "adults" : 95.7 + } + } + } + ] + } +} +``` + +Rank feature and rank features fields use top nine significant bits for precision, leading to about 0.4% relative error. Values are stored with a relative precision of 2−8 = 0.00390625. +{: .note } diff --git a/_field-types/supported-field-types/search-as-you-type.md b/_field-types/supported-field-types/search-as-you-type.md new file mode 100644 index 00000000..b9141e6b --- /dev/null +++ b/_field-types/supported-field-types/search-as-you-type.md @@ -0,0 +1,211 @@ +--- +layout: default +title: Search as you type +nav_order: 53 +has_children: false +parent: Autocomplete field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/search-as-you-type/ + - /field-types/search-as-you-type/ +--- + +# Search-as-you-type field type + +A search-as-you-type field type provides search-as-you-type functionality using both prefix and infix completion. + +## Example + +Mapping a search-as-you-type field creates n-gram subfields of this field, where n is in the range [2, `max_shingle_size`]. Additionally, it creates an index prefix subfield. + +Create a mapping with a search-as-you-type field: + +```json +PUT books +{ + "mappings": { + "properties": { + "suggestions": { + "type": "search_as_you_type" + } + } + } +} +``` +{% include copy-curl.html %} + +In addition to the `suggestions` field, this creates `suggestions._2gram`, `suggestions._3gram`, and `suggestions._index_prefix` fields. + +Index a document with a search-as-you-type field: + +```json +PUT books/_doc/1 +{ + "suggestions": "one two three four" +} +``` +{% include copy-curl.html %} + +To match terms in any order, use a bool_prefix or multi-match query. These queries rank the documents in which search terms are in the specified order higher than the documents in which terms are out of order. 
+ +```json +GET books/_search +{ + "query": { + "multi_match": { + "query": "tw one", + "type": "bool_prefix", + "fields": [ + "suggestions", + "suggestions._2gram", + "suggestions._3gram" + ] + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + "took" : 13, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "books", + "_type" : "_doc", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "suggestions" : "one two three four" + } + } + ] + } +} +``` + +To match terms in order, use a match_phrase_prefix query: + +```json +GET books/_search +{ + "query": { + "match_phrase_prefix": { + "suggestions": "two th" + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + "took" : 23, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 0.4793051, + "hits" : [ + { + "_index" : "books", + "_type" : "_doc", + "_id" : "1", + "_score" : 0.4793051, + "_source" : { + "suggestions" : "one two three four" + } + } + ] + } +} +``` + +To match the last terms exactly, use a match_phrase query: + +```json +GET books/_search +{ + "query": { + "match_phrase": { + "suggestions": "four" + } + } +} +``` +{% include copy-curl.html %} + +Response: + +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 0.2876821, + "hits" : [ + { + "_index" : "books", + "_type" : "_doc", + "_id" : "1", + "_score" : 0.2876821, + "_source" : { + "suggestions" : "one two three four" + } + } + ] + } +} +``` + +## Parameters + +The following table lists the parameters accepted by search-as-you-type field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`analyzer` | The analyzer to be used for this field. By default, it will be used at index time and at search time. To override it at search time, set the `search_analyzer` parameter. Default is the `standard` analyzer, which uses grammar-based tokenization and is based on the [Unicode Text Segmentation](https://unicode.org/reports/tr29/) algorithm. Configures the root field and subfields. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. Configures the root field and subfields. +`index_options` | Specifies the information to be stored in the index for search and highlighting. Valid values: `docs` (doc number only), `freqs` (doc number and term frequencies), `positions` (doc number, term frequencies, and term positions), `offsets` (doc number, term frequencies, term positions, and start and end character offsets). Default is `positions`. Configures the root field and subfields. +`max_shingle_size` | An integer that specifies the maximum n-gram size. Valid values are in the range [2, 4]. N-grams to be created are in the range [2, `max_shingle_size`]. Default is 3, which creates a 2-gram and a 3-gram. Larger `max_shingle_size` values work better for more specific queries but lead to a larger index size. +`norms` | A Boolean value that specifies whether the field length should be used when calculating relevance scores. 
Configures the root field and n-gram subfields (default is `false`). Does not configure the prefix subfield (in the prefix subfield, `norms` is `false`). +`search_analyzer` | The analyzer to be used at search time. Default is the analyzer specified in the `analyzer` parameter. Configures the root field and subfields. +`search_quote_analyzer` | The analyzer to be used at search time with phrases. Default is the analyzer specified in the `analyzer` parameter. Configures the root field and subfields. +`similarity` | The ranking algorithm for calculating relevance scores. Default is `BM25`. Configures the root field and subfields. +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the _source field. Default is `false`. Configures the root field only. +[`term_vector`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/text#term-vector-parameter) | A Boolean value that specifies whether a term vector for this field should be stored. Default is `no`. Configures the root field and n-gram subfields. Does not configure the prefix subfield. diff --git a/_field-types/supported-field-types/string.md b/_field-types/supported-field-types/string.md new file mode 100644 index 00000000..c891f86c --- /dev/null +++ b/_field-types/supported-field-types/string.md @@ -0,0 +1,22 @@ +--- +layout: default +title: String field types +nav_order: 45 +has_children: true +has_toc: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/string/ + - /field-types/string/ +--- + +# String field types + +String field types contain text values or values derived from text. The following table lists all string field types that OpenSearch supports. + +Field data type | Description +:--- | :--- +[`keyword`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/keyword/) | A string that is not analyzed. Useful for exact-value search. +[`text`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/text/) | A string that is analyzed. Useful for full-text search. +[`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/) | A space-optimized version of a `text` field. +[`token_count`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/token-count/) | Counts the number of tokens in a string. diff --git a/_field-types/supported-field-types/text.md b/_field-types/supported-field-types/text.md new file mode 100644 index 00000000..16350c0c --- /dev/null +++ b/_field-types/supported-field-types/text.md @@ -0,0 +1,171 @@ +--- +layout: default +title: Text +nav_order: 47 +has_children: false +parent: String field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/text/ + - /field-types/text/ +--- + +# Text field type + +A `text` field type contains a string that is analyzed. It is used for full-text search because it allows partial matches. Searches for multiple terms can match some but not all of them. Depending on the analyzer, results can be case insensitive, stemmed, have stopwords removed, have synonyms applied, and so on. + + +If you need to use a field for exact-value search, map it as a [`keyword`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/keyword/) instead. +{: .note } + +The [`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/) field is a space-optimized version of the `text` field. 
If you don't need to query phrases or use positional queries, map the field as `match_only_text` instead of `text`. Positional queries are queries in which the position of the term in the phrase is important, such as interval or span queries. +{: .note} + +## Example + +Create a mapping with a text field: + +```json +PUT movies +{ + "mappings" : { + "properties" : { + "title" : { + "type" : "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by text field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`analyzer` | The analyzer to be used for this field. By default, it will be used at index time and at search time. To override it at search time, set the `search_analyzer` parameter. Default is the `standard` analyzer, which uses grammar-based tokenization and is based on the [Unicode Text Segmentation](https://unicode.org/reports/tr29/) algorithm. +`boost` | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field's relevance. Values between 0.0 and 1.0 decrease the field's relevance. Default is 1.0. +`eager_global_ordinals` | Specifies whether global ordinals should be loaded eagerly on refresh. If the field is often used for aggregations, this parameter should be set to `true`. Default is `false`. +`fielddata` | A Boolean value that specifies whether to access analyzed tokens for this field for sorting, aggregation, and scripting. Default is `false`. +`fielddata_frequency_filter` | A JSON object that specifies to load into memory only those analyzed tokens whose document frequency is between the `min` and `max` values (provided as either an absolute number or a percentage). Frequency is computed per segment. Parameters: `min`, `max`, `min_segment_size`. Default is to load all analyzed tokens. +`fields` | To index the same string in several ways (for example, as a keyword and text), provide the fields parameter. You can specify one version of the field to be used for search and another to be used for sorting and aggregations. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +`index_options` | Specifies the information to be stored in the index for search and highlighting. Valid values: `docs` (doc number only), `freqs` (doc number and term frequencies), `positions` (doc number, term frequencies, and term positions), `offsets` (doc number, term frequencies, term positions, and start and end character offsets). Default is `positions`. +`index_phrases` | A Boolean value that specifies to index 2-grams separately. 2-grams are combinations of two consecutive words in this field's string. Leads to faster exact phrase queries with no slop but a larger index. Works best when stopwords are not removed. Default is `false`. +`index_prefixes` | A JSON object that specifies to index term prefixes separately. The number of characters in the prefix is between `min_chars` and `max_chars`, inclusive. Leads to faster prefix searches but a larger index. Optional parameters: `min_chars`, `max_chars`. Default `min_chars` is 2, `max_chars` is 5. +`meta` | Accepts metadata for this field. +`norms` | A Boolean value that specifies whether the field length should be used when calculating relevance scores. Default is `false`. +`position_increment_gap` | When text fields are analyzed, they are assigned positions. 
If a field contained an array of strings, and these positions were consecutive, this would lead to potentially matching across different array elements. To prevent this, an artificial gap is inserted between consecutive array elements. You can change this gap by specifying an integer `position_increment_gap`. Note: If `slop` is greater than `position_increment_gap`, matching across different array elements may occur. Default is 100.
+`similarity` | The ranking algorithm for calculating relevance scores. Default is `BM25`.
+[`term_vector`](#term-vector-parameter) | Specifies whether a term vector for this field should be stored. Default is `no`.
+
+## Term vector parameter
+
+A term vector is produced during analysis. It contains:
+- A list of terms.
+- The ordinal position of each term.
+- The start and end character offsets of each term within the field.
+- Payloads (if available). Each term can have custom binary data associated with the term's position.
+
+The `term_vector` field contains a JSON object that accepts the following parameters:
+
+Parameter | Stored values
+:--- | :---
+`no` | None. This is the default.
+`yes` | Terms in the field.
+`with_offsets` | Terms and character offsets.
+`with_positions_offsets` | Terms, positions, and character offsets.
+`with_positions_offsets_payloads` | Terms, positions, character offsets, and payloads.
+`with_positions` | Terms and positions.
+`with_positions_payloads` | Terms, positions, and payloads.
+
+Storing positions is useful for proximity queries. Storing character offsets is useful for highlighting.
+{: .tip }
+
+### Term vector parameter example
+
+Create a mapping with a text field that stores character offsets in a term vector:
+
+```json
+PUT testindex
+{
+  "mappings" : {
+    "properties" : {
+      "dob" : {
+        "type" : "text",
+        "term_vector": "with_positions_offsets"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Index a document with a text field:
+
+```json
+PUT testindex/_doc/1
+{
+  "dob" : "The patient's date of birth."
+}
+```
+{% include copy-curl.html %}
+
+Query for "date of birth" and highlight it in the original field:
+
+```json
+GET testindex/_search
+{
+  "query": {
+    "match": {
+      "dob": "date of birth"
+    }
+  },
+  "highlight": {
+    "fields": {
+      "dob": {}
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+The words "date of birth" are highlighted in the response:
+
+```json
+{
+  "took" : 854,
+  "timed_out" : false,
+  "_shards" : {
+    "total" : 1,
+    "successful" : 1,
+    "skipped" : 0,
+    "failed" : 0
+  },
+  "hits" : {
+    "total" : {
+      "value" : 1,
+      "relation" : "eq"
+    },
+    "max_score" : 0.8630463,
+    "hits" : [
+      {
+        "_index" : "testindex",
+        "_type" : "_doc",
+        "_id" : "1",
+        "_score" : 0.8630463,
+        "_source" : {
+          "dob" : "The patient's date of birth."
+        },
+        "highlight" : {
+          "dob" : [
+            "The patient's <em>date</em> <em>of</em> <em>birth</em>."
+          ]
+        }
+      }
+    ]
+  }
+}
+```
\ No newline at end of file
diff --git a/_field-types/supported-field-types/token-count.md b/_field-types/supported-field-types/token-count.md
new file mode 100644
index 00000000..6c3445e6
--- /dev/null
+++ b/_field-types/supported-field-types/token-count.md
@@ -0,0 +1,122 @@
+---
+layout: default
+title: Token count
+nav_order: 70
+has_children: false
+parent: String field types
+grand_parent: Supported field types
+redirect_from:
+  - /opensearch/supported-field-types/token-count/
+  - /field-types/token-count/
+---
+
+# Token count field type
+
+A token count field type stores the number of analyzed tokens in a string.
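+
+The count is based on the tokens produced by the analyzer specified for the `token_count` field, not on whitespace-separated words. To preview how a given analyzer tokenizes a string, you can use the `_analyze` API, as in the following sketch (it uses the `english` analyzer and the first sentence from the example below):
+
+```json
+GET _analyze
+{
+  "analyzer": "english",
+  "text": "To be, or not to be: that is the question."
+}
+```
+{% include copy-curl.html %}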
+ +## Example + +Create a mapping with a token count field: + +```json +PUT testindex +{ + "mappings": { + "properties": { + "sentence": { + "type": "text", + "fields": { + "num_words": { + "type": "token_count", + "analyzer": "english" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Index three documents with text fields: + +```json +PUT testindex/_doc/1 +{ "sentence": "To be, or not to be: that is the question." } +``` +{% include copy-curl.html %} + +```json +PUT testindex/_doc/2 +{ "sentence": "All the world’s a stage, and all the men and women are merely players." } +``` +{% include copy-curl.html %} + +```json +PUT testindex/_doc/3 +{ "sentence": "Now is the winter of our discontent." } +``` +{% include copy-curl.html %} + +Search for sentences with fewer than 10 words: + +```json +GET testindex/_search +{ + "query": { + "range": { + "sentence.num_words": { + "lt": 10 + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains one matching sentence: + +```json +{ + "took" : 8, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "testindex", + "_type" : "_doc", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "sentence" : "Now is the winter of our discontent." + } + } + ] + } +} +``` + +## Parameters + +The following table lists the parameters accepted by token count field types. The `analyzer` parameter is required; all other parameters are optional. + +Parameter | Description +:--- | :--- +`analyzer` | The analyzer to be used for this field. Specify an analyzer without token filters for optimal performance. Required. +`boost` | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field's relevance. Values between 0.0 and 1.0 decrease the field's relevance. Default is 1.0. +`doc_values` | A Boolean value that specifies whether the field should be stored on disk so that it can be used for aggregations, sorting, or scripting. Default is `false`. +`enable_position_increments` | A Boolean value that specifies whether position increments should be counted. To avoid removing stopwords, set this field to `false`. Default is `true`. +`index` | A Boolean value that specifies whether the field should be searchable. Default is `true`. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. +`store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the _source field. Default is `false`. diff --git a/_field-types/supported-field-types/unsigned-long.md b/_field-types/supported-field-types/unsigned-long.md new file mode 100644 index 00000000..dde8d25d --- /dev/null +++ b/_field-types/supported-field-types/unsigned-long.md @@ -0,0 +1,164 @@ +--- +layout: default +title: Unsigned long +parent: Numeric field types +grand_parent: Supported field types +nav_order: 15 +has_children: false +--- + +# Unsigned long field type + +The `unsigned_long` field type is a numeric field type that represents an unsigned 64-bit integer with a minimum value of 0 and a maximum value of 264 − 1. 
In the following example, `counter` is mapped as an `unsigned_long` field: + + +```json +PUT testindex +{ + "mappings" : { + "properties" : { + "counter" : { + "type" : "unsigned_long" + } + } + } +} +``` +{% include copy-curl.html %} + +## Indexing + +To index a document with an `unsigned_long` value, use the following request: + +```json +PUT testindex/_doc/1 +{ + "counter" : 10223372036854775807 +} +``` +{% include copy-curl.html %} + +Alternatively, you can use the [Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) as follows: + +```json +POST _bulk +{ "index": { "_index": "testindex", "_id": "1" } } +{ "counter": 10223372036854775807 } +``` +{% include copy-curl.html %} + +If a field of type `unsigned_long` has the `store` parameter set to `true` (that is, the field is a stored field), it will be stored and returned as a string. `unsigned_long` values do not support the decimal part, so, if supplied, the decimal part is truncated. +{: .note} + +## Querying + +`unsigned_long` fields support most of the queries that other numeric types support. For example, you can use a term query on `unsigned_long` fields: + +```json +POST _search +{ + "query": { + "term": { + "counter": { + "value": 10223372036854775807 + } + } + } +} +``` +{% include copy-curl.html %} + +You can also use a range query: + +```json +POST _search +{ + "query": { + "range": { + "counter": { + "gte": 10223372036854775807 + } + } + } +} +``` +{% include copy-curl.html %} + +## Sorting + +You can use `sort` values with `unsigned_long` fields to order the search results, for example: + +```json +POST _search +{ + "sort" : [ + { + "counter" : { + "order" : "asc" + } + } + ], + "query": { + "range": { + "counter": { + "gte": 10223372036854775807 + } + } + } +} +``` +{% include copy-curl.html %} + + +An `unsigned_long` field cannot be used as an index sort field (in the `sort.field` index setting). +{: .warning} + +## Aggregations + +Like other numeric fields, `unsigned_long` fields support aggregations. For `terms` and `multi_terms` aggregations, `unsigned_long` values are used as is, but for other aggregation types, the values are converted to the `double` type (with possible loss of precision). The following is an example of the `terms` aggregation: + +```json +POST _search +{ + "query": { + "match_all": {} + }, + "aggs": { + "counters": { + "terms": { + "field": "counter" + } + } + } +} +``` +{% include copy-curl.html %} + +## Scripting + +In scripts, `unsigned_long` fields are returned as instances of the `BigInteger` class: + +```json +POST _search +{ + "query": { + "bool": { + "filter": { + "script": { + "script": "BigInteger amount = doc['counter'].value; return amount.compareTo(BigInteger.ZERO) > 0;" + } + } + } + } +} +``` +{% include copy-curl.html %} + + +## Limitations + +Note the following limitations of the `unsigned_long` field type: + +- When aggregations are performed across different numeric types and one of the types is `unsigned_long`, the values are converted to the `double` type and `double` arithmetic is used, with high likelihood of precision loss. + +- An `unsigned_long` field cannot be used as an index sort field (in the `sort.field` index setting). This limitation also applies when a search is performed on multiple indexes and the results are sorted by the field that has the `unsigned_long` type in at least one of the indexes but a different numeric type or types in others. 
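+
+To see the conversion to the `double` type described in the Aggregations section (and referenced by the first limitation above), you can run a non-`terms` aggregation, such as `max`, on the `counter` field. The following is a minimal sketch (the aggregation name `max_counter` is arbitrary); the returned value is a `double` and may lose precision for very large counters:
+
+```json
+POST _search
+{
+  "size": 0,
+  "aggs": {
+    "max_counter": {
+      "max": {
+        "field": "counter"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}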
\ No newline at end of file diff --git a/_field-types/supported-field-types/xy-point.md b/_field-types/supported-field-types/xy-point.md new file mode 100644 index 00000000..57b6f647 --- /dev/null +++ b/_field-types/supported-field-types/xy-point.md @@ -0,0 +1,106 @@ +--- +layout: default +title: xy point +nav_order: 58 +has_children: false +parent: Cartesian field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/xy-point/ + - /field-types/xy-point/ +--- + +# xy point field type + +An xy point field type contains a point in a two-dimensional Cartesian coordinate system, specified by x and y coordinates. It is based on the Lucene [XYPoint](https://lucene.apache.org/core/9_3_0/core/org/apache/lucene/geo/XYPoint.html) field type. The xy point field type is similar to the [geopoint]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point/) field type, but does not have the range limitations of geopoint. The coordinates of an xy point are single-precision floating-point values. For information about the range and precision of floating-point values, see [Numeric field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/). + +## Example + +Create a mapping with an xy point field type: + +```json +PUT testindex1 +{ + "mappings": { + "properties": { + "point": { + "type": "xy_point" + } + } + } +} +``` +{% include copy-curl.html %} + +## Formats + +xy points can be indexed in the following formats: + +- An object with x and y coordinates + +```json +PUT testindex1/_doc/1 +{ + "point": { + "x": 0.5, + "y": 4.5 + } +} +``` +{% include copy-curl.html %} + +- A string in the "`x`, `y`" format + +```json +PUT testindex1/_doc/2 +{ + "point": "0.5, 4.5" +} +``` +{% include copy-curl.html %} + +- An array in the [`x`, `y`] format + +```json +PUT testindex1/_doc/3 +{ + "point": [0.5, 4.5] +} +``` +{% include copy-curl.html %} + +- A [well-known text (WKT)](https://docs.opengeospatial.org/is/12-063r5/12-063r5.html) POINT in the "POINT(`x` `y`)" format + +```json +PUT testindex1/_doc/4 +{ + "point": "POINT (0.5 4.5)" +} +``` +{% include copy-curl.html %} + +- GeoJSON format + +```json +PUT testindex1/_doc/5 +{ + "point" : { + "type" : "Point", + "coordinates" : [0.5, 4.5] + } +} +``` +{% include copy-curl.html %} + +In all xy point formats, the coordinates must be specified in the `x, y` order. +{: .note} + +## Parameters + +The following table lists the parameters accepted by xy point field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`ignore_malformed` | A Boolean value that specifies to ignore malformed values and not to throw an exception. Default is `false`. +`ignore_z_value` | Specific to points with three coordinates. If `ignore_z_value` is `true`, the third coordinate is not indexed but is still stored in the _source field. If `ignore_z_value` is `false`, an exception is thrown. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. The value must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. 
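+
+As a quick illustration of how indexed xy points are typically searched, the following is a sketch that assumes the `xy_shape` query type is available; it looks for points that intersect a bounding envelope specified by its upper-left and lower-right vertices, using the `testindex1` mapping shown above. Refer to the query DSL documentation for the full query syntax:
+
+```json
+GET testindex1/_search
+{
+  "query": {
+    "xy_shape": {
+      "point": {
+        "shape": {
+          "type": "envelope",
+          "coordinates": [ [ 0.0, 6.0 ], [ 4.0, 2.0 ] ]
+        },
+        "relation": "INTERSECTS"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}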
\ No newline at end of file
diff --git a/_field-types/supported-field-types/xy-shape.md b/_field-types/supported-field-types/xy-shape.md
new file mode 100644
index 00000000..f1c71912
--- /dev/null
+++ b/_field-types/supported-field-types/xy-shape.md
@@ -0,0 +1,404 @@
+---
+layout: default
+title: xy shape
+nav_order: 59
+has_children: false
+parent: Cartesian field types
+grand_parent: Supported field types
+redirect_from:
+  - /opensearch/supported-field-types/xy-shape/
+  - /field-types/xy-shape/
+---
+
+# xy shape field type
+
+An xy shape field type contains a shape, such as a polygon or a collection of xy points. It is based on the Lucene [XYShape](https://lucene.apache.org/core/9_3_0/core/org/apache/lucene/document/XYShape.html) field type. To index an xy shape, OpenSearch tessellates the shape into a triangular mesh and stores each triangle in a BKD tree (a set of balanced k-dimensional trees). This provides a 10^−7 decimal degree of precision, which represents near-perfect spatial resolution.
+
+The xy shape field type is similar to the [geoshape]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-shape/) field type, but it represents shapes on the Cartesian plane, which is not based on the Earth-fixed terrestrial reference system. The coordinates of an xy shape are single-precision floating-point values. For information about the range and precision of floating-point values, see [Numeric field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/).
+
+## Example
+
+Create a mapping with an xy shape field type:
+
+```json
+PUT testindex
+{
+  "mappings": {
+    "properties": {
+      "location": {
+        "type": "xy_shape"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Formats
+
+xy shapes can be indexed in the following formats:
+
+- [GeoJSON](https://geojson.org/)
+- [Well-known text (WKT)](https://docs.opengeospatial.org/is/12-063r5/12-063r5.html)
+
+In both GeoJSON and WKT, the coordinates must be specified in the `x, y` order within coordinate arrays.
+{: .note}
+
+## xy shape types
+
+The following table describes the possible xy shape types and their relationship to the GeoJSON and WKT types.
+
+OpenSearch type | GeoJSON type | WKT type | Description
+:--- | :--- | :--- | :---
+[`point`](#point) | Point | POINT | A geographic point specified by the x and y coordinates.
+[`linestring`](#linestring) | LineString | LINESTRING | A line specified by two or more points. May be a straight line or a path of connected line segments.
+[`polygon`](#polygon) | Polygon | POLYGON | A polygon specified by a list of vertices in coordinate form. The polygon must be closed, meaning the last point must be the same as the first point. Therefore, to create an n-gon, n+1 vertices are required. The minimum number of vertices is four, which creates a triangle.
+[`multipoint`](#multipoint) | MultiPoint | MULTIPOINT | An array of discrete related points that are not connected.
+[`multilinestring`](#multilinestring) | MultiLineString | MULTILINESTRING | An array of linestrings.
+[`multipolygon`](#multipolygon) | MultiPolygon | MULTIPOLYGON | An array of polygons.
+[`geometrycollection`](#geometry-collection) | GeometryCollection | GEOMETRYCOLLECTION | A collection of xy shapes that may be of different types.
+[`envelope`](#envelope) | N/A | BBOX | A bounding rectangle specified by upper-left and lower-right vertices.
+
+## Point
+
+A point is specified by a single pair of coordinates.
+ +Index a point in GeoJSON format: + +```json +PUT testindex/_doc/1 +{ + "location" : { + "type" : "point", + "coordinates" : [0.5, 4.5] + } +} +``` +{% include copy-curl.html %} + +Index a point in WKT format: + +```json +PUT testindex/_doc/1 +{ + "location" : "POINT (0.5 4.5)" +} +``` +{% include copy-curl.html %} + +## Linestring + +A linestring is a line specified by two or more points. If the points are collinear, the linestring is a straight line. Otherwise, the linestring represents a path made of line segments. + +Index a linestring in GeoJSON format: + +```json +PUT testindex/_doc/2 +{ + "location" : { + "type" : "linestring", + "coordinates" : [[0.5, 4.5], [-1.5, 2.3]] + } +} +``` +{% include copy-curl.html %} + +Index a linestring in WKT format: + +```json +PUT testindex/_doc/2 +{ + "location" : "LINESTRING (0.5 4.5, -1.5 2.3)" +} +``` +{% include copy-curl.html %} + +## Polygon + +A polygon is specified by a list of vertices in coordinate form. The polygon must be closed, meaning the last point must be the same as the first point. In the following example, a triangle is created using four points. + +GeoJSON requires that you list the vertices of the polygon counterclockwise. WKT does not impose a specific order on vertices. +{: .note} + +Index a polygon (triangle) in GeoJSON format: + +```json +PUT testindex/_doc/3 +{ + "location" : { + "type" : "polygon", + "coordinates" : [ + [[0.5, 4.5], + [2.5, 6.0], + [1.5, 2.0], + [0.5, 4.5]] + ] + } +} +``` +{% include copy-curl.html %} + +Index a polygon (triangle) in WKT format: + +```json +PUT testindex/_doc/3 +{ + "location" : "POLYGON ((0.5 4.5, 2.5 6.0, 1.5 2.0, 0.5 4.5))" +} +``` +{% include copy-curl.html %} + +The polygon may have holes inside. In this case, the `coordinates` field will contain multiple arrays. The first array represents the outer polygon, and each subsequent array represents a hole. Holes are represented as polygons and specified as arrays of coordinates. + +GeoJSON requires that you list the vertices of the polygon counterclockwise and the vertices of the hole clockwise. WKT does not impose a specific order on vertices. +{: .note} + +Index a polygon (triangle) with a triangular hole in GeoJSON format: + +```json +PUT testindex/_doc/4 +{ + "location" : { + "type" : "polygon", + "coordinates" : [ + [[0.5, 4.5], + [2.5, 6.0], + [1.5, 2.0], + [0.5, 4.5]], + + [[1.0, 4.5], + [1.5, 4.5], + [1.5, 4.0], + [1.0, 4.5]] + ] + } +} +``` +{% include copy-curl.html %} + +Index a polygon (triangle) with a triangular hole in WKT format: + +```json +PUT testindex/_doc/4 +{ + "location" : "POLYGON ((0.5 4.5, 2.5 6.0, 1.5 2.0, 0.5 4.5), (1.0 4.5, 1.5 4.5, 1.5 4.0, 1.0 4.5))" +} +``` +{% include copy-curl.html %} + +By default, the vertices of the polygon are traversed in a counterclockwise order. You can define an [`orientation`](#parameters) parameter to specify the vertex traversal order at mapping time: + +```json +PUT testindex +{ + "mappings": { + "properties": { + "location": { + "type": "xy_shape", + "orientation" : "left" + } + } + } +} +``` +{% include copy-curl.html %} + +Subsequently indexed documents can override the `orientation` setting: + +```json +PUT testindex/_doc/3 +{ + "location" : { + "type" : "polygon", + "orientation" : "cw", + "coordinates" : [ + [[0.5, 4.5], + [2.5, 6.0], + [1.5, 2.0], + [0.5, 4.5]] + ] + } +} +``` +{% include copy-curl.html %} + +## Multipoint + +A multipoint is an array of discrete related points that are not connected. 
+ +Index a multipoint in GeoJSON format: + +```json +PUT testindex/_doc/6 +{ + "location" : { + "type" : "multipoint", + "coordinates" : [ + [0.5, 4.5], + [2.5, 6.0] + ] + } +} +``` +{% include copy-curl.html %} + +Index a multipoint in WKT format: + +```json +PUT testindex/_doc/6 +{ + "location" : "MULTIPOINT (0.5 4.5, 2.5 6.0)" +} +``` +{% include copy-curl.html %} + +## Multilinestring + +A multilinestring is an array of linestrings. + +Index a multilinestring in GeoJSON format: + +```json +PUT testindex/_doc/2 +{ + "location" : { + "type" : "multilinestring", + "coordinates" : [ + [[0.5, 4.5], [2.5, 6.0]], + [[1.5, 2.0], [3.5, 3.5]] + ] + } +} +``` +{% include copy-curl.html %} + +Index a linestring in WKT format: + +```json +PUT testindex/_doc/2 +{ + "location" : "MULTILINESTRING ((0.5 4.5, 2.5 6.0), (1.5 2.0, 3.5 3.5))" +} +``` +{% include copy-curl.html %} + +## Multipolygon + +A multipolygon is an array of polygons. In this example, the first polygon contains a hole, and the second does not. + +Index a multipolygon in GeoJSON format: + +```json +PUT testindex/_doc/4 +{ + "location" : { + "type" : "multipolygon", + "coordinates" : [ + [ + [[0.5, 4.5], + [2.5, 6.0], + [1.5, 2.0], + [0.5, 4.5]], + + [[1.0, 4.5], + [1.5, 4.5], + [1.5, 4.0], + [1.0, 4.5]] + ], + [ + [[2.0, 0.0], + [1.0, 2.0], + [3.0, 1.0], + [2.0, 0.0]] + ] + ] + } +} +``` +{% include copy-curl.html %} + +Index a multipolygon in WKT format: + +```json +PUT testindex/_doc/4 +{ + "location" : "MULTIPOLYGON (((0.5 4.5, 2.5 6.0, 1.5 2.0, 0.5 4.5), (1.0 4.5, 1.5 4.5, 1.5 4.0, 1.0 4.5)), ((2.0 0.0, 1.0 2.0, 3.0 1.0, 2.0 0.0)))" +} +``` +{% include copy-curl.html %} + +## Geometry collection + +A geometry collection is a collection of xy shapes that may be of different types. + +Index a geometry collection in GeoJSON format: + +```json +PUT testindex/_doc/7 +{ + "location" : { + "type": "geometrycollection", + "geometries": [ + { + "type": "point", + "coordinates": [0.5, 4.5] + }, + { + "type": "linestring", + "coordinates": [[2.5, 6.0], [1.5, 2.0]] + } + ] + } +} +``` +{% include copy-curl.html %} + +Index a geometry collection in WKT format: + +```json +PUT testindex/_doc/7 +{ + "location" : "GEOMETRYCOLLECTION (POINT (0.5 4.5), LINESTRING(2.5 6.0, 1.5 2.0))" +} +``` +{% include copy-curl.html %} + +## Envelope + +An envelope is a bounding rectangle specified by upper-left and lower-right vertices. The GeoJSON format is `[[minX, maxY], [maxX, minY]]`. + +Index an envelope in GeoJSON format: + +```json +PUT testindex/_doc/2 +{ + "location" : { + "type" : "envelope", + "coordinates" : [[3.0, 2.0], [6.0, 0.0]] + } +} +``` +{% include copy-curl.html %} + +In WKT format, use `BBOX (minX, maxY, maxX, minY)`. + +Index an envelope in WKT BBOX format: + +```json +PUT testindex/_doc/8 +{ + "location" : "BBOX (3.0, 2.0, 6.0, 0.0)" +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by xy shape field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`coerce` | A Boolean value that specifies whether to automatically close unclosed linear rings. Default is `false`. +`ignore_malformed` | A Boolean value that specifies to ignore malformed GeoJSON or WKT xy shapes and not to throw an exception. Default is `false` (throw an exception when xy shapes are malformed). +`ignore_z_value` | Specific to points with three coordinates. If `ignore_z_value` is `true`, the third coordinate is not indexed but is still stored in the _source field. 
If `ignore_z_value` is `false`, an exception is thrown. Default is `true`. +`orientation` | Specifies the traversal order of the vertices in the xy shape's list of coordinates. `orientation` takes the following values:
1. RIGHT: counterclockwise. Specify RIGHT orientation by using one of the following strings (uppercase or lowercase): `right`, `counterclockwise`, `ccw`.
2. LEFT: clockwise. Specify LEFT orientation by using one of the following strings (uppercase or lowercase): `left`, `clockwise`, `cw`. This value can be overridden by individual documents.
Default is `RIGHT`. \ No newline at end of file diff --git a/_field-types/supported-field-types/xy.md b/_field-types/supported-field-types/xy.md new file mode 100644 index 00000000..a6f8a45a --- /dev/null +++ b/_field-types/supported-field-types/xy.md @@ -0,0 +1,29 @@ +--- +layout: default +title: Cartesian field types +nav_order: 57 +has_children: true +has_toc: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/xy/ + - /field-types/xy/ +--- + +# Cartesian field types + +Cartesian field types facilitate indexing and searching of points and shapes in a two-dimensional Cartesian coordinate system. Cartesian field types are similar to [geographic]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geographic/) field types, except they represent points and shapes on the Cartesian plane, which is not based on the Earth-fixed terrestrial reference system. Calculating distances on a plane is more efficient than calculating distances on a sphere, so distance sorting is faster for Cartesian field types. + +Cartesian field types work well for spatial applications like virtual reality, computer-aided design (CAD), and amusement park and sporting venue mapping. + +The coordinates for the Cartesian field types are single-precision floating-point values. For information about the range and precision of floating-point values, see [Numeric field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/). + +The following table lists all Cartesian field types that OpenSearch supports. + +Field Data type | Description +:--- | :--- +[`xy_point`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/xy-point/) | A point in a two-dimensional Cartesian coordinate system, specified by x and y coordinates. +[`xy_shape`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/xy-shape/) | A shape, such as a polygon or a collection of xy points, in a two-dimensional Cartesian coordinate system. + +Currently, OpenSearch supports indexing and searching of Cartesian field types but not aggregations on Cartesian field types. If you'd like to see aggregations implemented, open a [GitHub issue](https://github.com/opensearch-project/geospatial). +{: .note} \ No newline at end of file diff --git a/_opensearch/data-streams.md b/_im-plugin/data-streams.md similarity index 85% rename from _opensearch/data-streams.md rename to _im-plugin/data-streams.md index ff06d6e1..d59526b7 100644 --- a/_opensearch/data-streams.md +++ b/_im-plugin/data-streams.md @@ -8,18 +8,18 @@ nav_order: 13 If you're ingesting continuously generated time-series data such as logs, events, and metrics into OpenSearch, you're likely in a scenario where the number of documents grows rapidly and you don't need to update older documents. -A typical workflow to manage time-series data involves multiple steps, such as creating a rollover index alias, defining a write index, and defining common mappings and settings for the backing indices. +A typical workflow to manage time-series data involves multiple steps, such as creating a rollover index alias, defining a write index, and defining common mappings and settings for the backing indexes. Data streams simplify this process and enforce a setup that best suits time-series data, such as being designed primarily for append-only data and ensuring that each document has a timestamp field. -A data stream is internally composed of multiple backing indices. 
Search requests are routed to all the backing indices, while indexing requests are routed to the latest write index. [ISM]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) policies let you automatically handle index rollovers or deletions. +A data stream is internally composed of multiple backing indexes. Search requests are routed to all the backing indexes, while indexing requests are routed to the latest write index. [ISM]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) policies let you automatically handle index rollovers or deletions. ## Get started with data streams ### Step 1: Create an index template -To create a data stream, you first need to create an index template that configures a set of indices as a data stream. The `data_stream` object indicates that it’s a data stream and not a regular index template. The index pattern matches with the name of the data stream: +To create a data stream, you first need to create an index template that configures a set of indexes as a data stream. The `data_stream` object indicates that it’s a data stream and not a regular index template. The index pattern matches with the name of the data stream: ```json PUT _index_template/logs-template @@ -85,7 +85,7 @@ To see information about a specific data stream: GET _data_stream/logs-nginx ``` -#### Sample response +#### Example response ```json { @@ -109,7 +109,7 @@ GET _data_stream/logs-nginx } ``` -You can see the name of the timestamp field, the list of the backing indices, and the template that's used to create the data stream. You can also see the health of the data stream, which represents the lowest status of all its backing indices. +You can see the name of the timestamp field, the list of the backing indexes, and the template that's used to create the data stream. You can also see the health of the data stream, which represents the lowest status of all its backing indexes. To see more insights about the data stream, use the `_stats` endpoint: @@ -117,7 +117,7 @@ To see more insights about the data stream, use the `_stats` endpoint: GET _data_stream/logs-nginx/_stats ``` -#### Sample response +#### Example response ```json { @@ -140,6 +140,12 @@ GET _data_stream/logs-nginx/_stats } ``` +To see information about all data streams, use the following request: + +```json +GET _data_stream +``` + ### Step 3: Ingest data into the data stream To ingest data into a data stream, you can use the regular indexing APIs. Make sure every document that you index has a timestamp field. If you try to ingest a document that doesn't have a timestamp field, you get an error. @@ -155,7 +161,7 @@ POST logs-redis/_doc ### Step 4: Searching a data stream You can search a data stream just like you search a regular index or an index alias. -The search operation applies to all of the backing indices (all data present in the stream). +The search operation applies to all of the backing indexes (all data present in the stream). ```json GET logs-redis/_search @@ -168,7 +174,7 @@ GET logs-redis/_search } ``` -#### Sample response +#### Example response ```json { @@ -212,7 +218,7 @@ To perform manual rollover operation on the data stream: POST logs-redis/_rollover ``` -#### Sample response +#### Example response ```json { @@ -228,8 +234,8 @@ POST logs-redis/_rollover If you now perform a `GET` operation on the `logs-redis` data stream, you see that the generation ID is incremented from 1 to 2. 
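+
+For example, to check the new generation, you can repeat the earlier request for data stream information, this time for `logs-redis`:
+
+```json
+GET _data_stream/logs-redis
+```
+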
-You can also set up an [Index State Management (ISM) policy]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/) to automate the rollover process for the data stream. -The ISM policy is applied to the backing indices at the time of their creation. When you associate a policy to a data stream, it only affects the future backing indices of that data stream. +You can also set up an [Index State Management (ISM) policy]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/) to automate the rollover process for the data stream. +The ISM policy is applied to the backing indexes at the time of their creation. When you associate a policy to a data stream, it only affects the future backing indexes of that data stream. You also don’t need to provide the `rollover_alias` setting, because the ISM policy infers this information from the backing index. @@ -237,7 +243,7 @@ You also don’t need to provide the `rollover_alias` setting, because the ISM p To manage data streams from OpenSearch Dashboards, open **OpenSearch Dashboards**, choose **Index Management**, select **Indices** or **Policy managed indices**. -You see a toggle switch for data streams that you can use to show or hide indices belonging to a data stream. +You see a toggle switch for data streams that you can use to show or hide indexes belonging to a data stream. When you enable this switch, you see a data stream multi-select dropdown menu that you can use for filtering data streams. You also see a data stream column that shows you the name of the data stream the index is contained in. @@ -250,9 +256,9 @@ You can performing visualizations on a data stream just like you would on a regu ### Step 7: Delete a data stream -The delete operation first deletes the backing indices of a data stream and then deletes the data stream itself. +The delete operation first deletes the backing indexes of a data stream and then deletes the data stream itself. -To delete a data stream and all of its hidden backing indices: +To delete a data stream and all of its hidden backing indexes: ```json DELETE _data_stream/ @@ -262,4 +268,4 @@ You can use wildcards to delete more than one data stream. We recommend deleting data from a data stream using an ISM policy. -You can also use [asynchronous search]({{site.url}}{{site.baseurl}}/search-plugins/async/index/) and [SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/index/) and [PPL]({{site.url}}{{site.baseurl}}/search-plugins/ppl/index/) to query your data stream directly. You can also use the security plugin to define granular permissions on the data stream name. +You can also use [asynchronous search]({{site.url}}{{site.baseurl}}/search-plugins/async/index/), [SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/index/), and [PPL]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/) to query your data stream directly. You can also use the Security plugin to define granular permissions for the data stream name. diff --git a/_im-plugin/index-alias.md b/_im-plugin/index-alias.md new file mode 100644 index 00000000..1816b0e0 --- /dev/null +++ b/_im-plugin/index-alias.md @@ -0,0 +1,274 @@ +--- +layout: default +title: Index aliases +nav_order: 11 +redirect_from: + - /opensearch/index-alias/ +--- + +# Index aliases + +An alias is a virtual index name that can point to one or more indexes. + +If your data is spread across multiple indexes, rather than keeping track of which indexes to query, you can create an alias and query it instead. 
+ +For example, if you’re storing logs into indexes based on the month and you frequently query the logs for the previous two months, you can create a `last_2_months` alias and update the indexes it points to each month. + +Because you can change the indexes an alias points to at any time, referring to indexes using aliases in your applications allows you to reindex your data without any downtime. + +## Create aliases + +To create an alias, use a POST request: + +```json +POST _aliases +``` +{% include copy-curl.html %} + +Use the `actions` method to specify the list of actions that you want to perform. This command creates an alias named `alias1` and adds `index-1` to this alias: + +```json +POST _aliases +{ + "actions": [ + { + "add": { + "index": "index-1", + "alias": "alias1" + } + } + ] +} +``` +{% include copy-curl.html %} + +The following response is returned: + +```json +{ + "acknowledged": true +} +``` +{% include copy-curl.html %} + +If the request fails, make sure the index that you're adding to the alias already exists. + +You can also create an alias using one of the following requests: + +```json +PUT /_aliases/ +POST /_aliases/ +PUT /_alias/ +POST /_alias/ +``` +{% include copy-curl.html %} + +The `` in the above requests can be an index name, a comma-separated list of index names, or a wildcard expression. Use `_all` to refer to all indexes. + +To check if `alias1` refers to `index-1`, run one of the following commands: + +```json +GET /_alias/alias1 +GET /index-1/_alias/alias1 +``` +{% include copy-curl.html %} + +To get the indexes' mappings and settings information referenced by the alias, run the following command: + +```json +GET alias1 +``` +{% include copy-curl.html %} + +## Add or remove indexes + +You can perform multiple actions using the same `_aliases` operation. For example, the following command removes `index-1` and adds `index-2` to `alias1`: + +```json +POST _aliases +{ + "actions": [ + { + "remove": { + "index": "index-1", + "alias": "alias1" + } + }, + { + "add": { + "index": "index-2", + "alias": "alias1" + } + } + ] +} +``` +{% include copy-curl.html %} + +The `add` and `remove` actions occur atomically, which means that at no point will `alias1` point to both `index-1` and `index-2`. You can also add indexes based on an index pattern, as shown in the following POST request: + +```json +POST _aliases +{ + "actions": [ + { + "add": { + "index": "index*", + "alias": "alias1" + } + } + ] +} +``` +{% include copy-curl.html %} + +The `remove` action also supports the `must_exist` parameter. If the parameter is set to `true` and the specified alias does not exist, an exception is thrown. If the parameter is set to `false`, then no action is taken if the specified alias does not exist. The default value for `must_exist` is `null`. An exception will be thrown only if none of the specified aliases exist. 
+ +The following POST request uses the `remove` action with the `must_exist` parameter set to `true`: + +```json +POST _aliases +{ + "actions": [ + { + "remove": { + "index": "index-1", + "alias": "alias1", + "must_exist": true + } + } + ] +} +``` +{% include copy-curl.html %} + +## Manage aliases + +To list the mapping of aliases to indexes, run the following command: + +```json +GET _cat/aliases?v +``` +{% include copy-curl.html %} + +#### Example response + +```json +alias index filter routing.index routing.search +alias1 index-1 * - - +``` +{% include copy-curl.html %} + +To check which indexes an alias points to, run the following command: + +```json +GET _alias/alias1 +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "index-2": { + "aliases": { + "alias1": {} + } + } +} +``` +{% include copy-curl.html %} + +Conversely, to find which alias points to a specific index, run the following command: + +```json +GET /index-2/_alias/* +``` +{% include copy-curl.html %} + +To get all index names and their aliases, run the following command: + +```json +GET /_alias +``` +{% include copy-curl.html %} + +To check if an alias exists, run one of the following commands: + +```json +HEAD /alias1/_alias/ +HEAD /_alias/alias1/ +HEAD index-1/_alias/alias1/ +``` +{% include copy-curl.html %} + +## Add aliases at index creation + +You can add an index to an alias as you create the index, as shown in the following PUT request: + +```json +PUT index-1 +{ + "aliases": { + "alias1": {} + } +} +``` +{% include copy-curl.html %} + +## Create filtered aliases + +You can create a filtered alias to access a subset of documents or fields in the underlying indexes. This command adds only a specific timestamp field to `alias1`. The following shows an example POST request: + +```json +POST _aliases +{ + "actions": [ + { + "add": { + "index": "index-1", + "alias": "alias1", + "filter": { + "term": { + "timestamp": "1574641891142" + } + } + } + } + ] +} +``` +{% include copy-curl.html %} + +## Index alias options + +You can specify the options shown in the following table. + +Option | Valid values | Description | Required +:--- | :--- | :--- +`index` | String | The name of the index that the alias points to. | Yes +`alias` | String | The name of the alias. | No +`filter` | Object | Add a filter to the alias. | No +`routing` | String | Limit search to an associated shard value. You can specify `search_routing` and `index_routing` independently. | No +`is_write_index` | String | Specify the index that accepts any write operations to the alias. If this value is not specified, then no write operations are allowed. | No + +## Delete aliases + +To delete one or more aliases from an index, use the following request: + +```json +DELETE /_alias/ +DELETE /_aliases/ +``` +{% include copy-curl.html %} + +Both `` and `` in the above request support comma-separated lists and wildcard expressions. Use `_all` in place of `` to delete all aliases for the indexes listed in ``. + +For example, if `alias1` refers to `index-1` and `index-2`, you can run the following command to remove `alias1` from `index-1`: + +```json +DELETE index-1/_alias/alias1 +``` +{% include copy-curl.html %} + +After running the request, `alias1` no longer refers to `index-1` but still refers to `index-2`. 
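+
+To confirm the change, you can reuse the earlier request for listing the indexes that an alias points to and verify that only `index-2` is returned:
+
+```json
+GET _alias/alias1
+```
+{% include copy-curl.html %}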
diff --git a/_im-plugin/index-codecs.md b/_im-plugin/index-codecs.md new file mode 100644 index 00000000..f880e141 --- /dev/null +++ b/_im-plugin/index-codecs.md @@ -0,0 +1,92 @@ +--- +layout: default +title: Index codecs +nav_order: 3 +parent: Index settings +--- + +# Index codecs + +Index codecs determine how the index’s stored fields are compressed and stored on disk. The index codec is controlled by the static `index.codec` setting that specifies the compression algorithm. The setting impacts the index shard size and index operation performance. + +## Supported codecs + +OpenSearch provides support for four codecs that can be used for compressing the stored fields. Each codec offers different tradeoffs between compression ratio (storage size) and indexing performance (speed): + +* `default` -- This codec employs the [LZ4 algorithm](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) with a preset dictionary, which prioritizes performance over compression ratio. It offers faster indexing and search operations when compared with `best_compression` but may result in larger index/shard sizes. If no codec is provided in the index settings, then LZ4 is used as the default algorithm for compression. +* `best_compression` -- This codec uses [zlib](https://en.wikipedia.org/wiki/Zlib) as an underlying algorithm for compression. It achieves high compression ratios that result in smaller index sizes. However, this may incur additional CPU usage during index operations and may subsequently result in high indexing and search latencies. + +As of OpenSearch 2.9, two new codecs based on the [Zstandard compression algorithm](https://github.com/facebook/zstd) are available. This algorithm provides a good balance between compression ratio and speed. + +It may be challenging to change the codec setting of an existing index (see [Changing an index codec](#changing-an-index-codec)), so it is important to test a representative workload in a non-production environment before using a new codec setting. +{: .important} + +* `zstd` (OpenSearch 2.9 and later) -- This codec provides significant compression comparable to the `best_compression` codec with reasonable CPU usage and improved indexing and search performance compared to the `default` codec. +* `zstd_no_dict` (OpenSearch 2.9 and later) -- This codec is similar to `zstd` but excludes the dictionary compression feature. It provides faster indexing and search operations compared to `zstd` at the expense of a slightly larger index size. + +As of OpenSearch 2.10, the `zstd` and `zstd_no_dict` compression codecs cannot be used for [k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/) or [Security Analytics]({{site.url}}{{site.baseurl}}/security-analytics/index/) indexes. +{: .warning} + +For the `zstd` and `zstd_no_dict` codecs, you can optionally specify a compression level in the `index.codec.compression_level` setting. This setting takes integers in the [1, 6] range. A higher compression level results in a higher compression ratio (smaller storage size) with a tradeoff in speed (slower compression and decompression speeds lead to greater indexing and search latencies). + +When an index segment is created, it uses the current index codec for compression. If you update the index codec, any segment created after the update will use the new compression algorithm. For specific operation considerations, see [Index codec considerations for index operations](#index-codec-considerations-for-index-operations). 
+{: .note} + +## Choosing a codec + +The choice of index codec impacts the amount of disk space required to store the index data. Codecs like `best_compression`, `zstd`, and `zstd_no_dict` can achieve higher compression ratios, resulting in smaller index sizes. Conversely, the `default` codec doesn’t prioritize compression ratio, resulting in larger index sizes but faster search operations than `best_compression`. + +## Index codec considerations for index operations + +The following index codec considerations apply to various index operations. + +### Writes + +Every index consists of shards, each of which is further divided into Lucene segments. During index writes, the new segments are created based on the codec specified in the index settings. If you update the codec for an index, the new segments will use the new codec algorithm. + +### Merges + +During segment merges, OpenSearch combines smaller index segments into larger segments in order to provide optimal resource utilization and improve performance. The index codec setting influences the speed and efficiency of the merge operations. The number of merges that happen on an index is a factor of the segment size, and a smaller segment size directly translates into smaller merge sizes. If you update the `index.codec` setting, the new merge operations will use the new codec when creating merged segments. The merged segments will have the compression characteristics of the new codec. + +### Splits and shrinks + +The [Split API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/split/) splits an original index into a new index where each original primary shard is divided into two or more primary shards. The [Shrink API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/shrink-index/) shrinks an existing index to a new index with a smaller number of primary shards. As part of split or shrink operations, any newly created segments will use the latest codec settings. + +### Snapshots + +When creating a [snapshot]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/index/), the index codec setting influences the size of the snapshot and the time required for its creation. If the codec of an index is updated, newly created snapshots will use the latest codec setting. The resulting snapshot size will reflect the compression characteristics of the latest codec setting. Existing segments included in the snapshot will retain their original compression characteristics. + +When you restore the indexes from a snapshot of a cluster to another cluster, it is important to verify that the target cluster supports the codecs of the segments in the source snapshot. For example, if the source snapshot contains segments of the `zstd` or `zstd_no_dict` codecs (introduced in OpenSearch 2.9), you won't be able to restore the snapshot to a cluster that runs on an older OpenSearch version because it doesn't support these codecs. + +### Reindexing + +When you are performing a [reindex]({{site.url}}{{site.baseurl}}/im-plugin/reindex-data/) operation from a source index, the new segments created in the target index will have the properties of the codec settings of the target index. 
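+
+For example, the following sketch (the index names `target-index-1` and `source-index-1` are placeholders) creates a target index that uses the `zstd` codec with a compression level of `3` and then reindexes an existing index into it so that the copied data is stored using the new codec:
+
+```json
+PUT target-index-1
+{
+  "settings": {
+    "index.codec": "zstd",
+    "index.codec.compression_level": 3
+  }
+}
+```
+
+```json
+POST _reindex
+{
+  "source": {
+    "index": "source-index-1"
+  },
+  "dest": {
+    "index": "target-index-1"
+  }
+}
+```
+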
+ +### Index rollups and transforms + +When an index [rollup]({{site.url}}{{site.baseurl}}/im-plugin/index-rollups/) or [transform]({{site.url}}{{site.baseurl}}/im-plugin/index-transforms/) job is completed, the segments created in the target index will have the properties of the index codec specified during target index creation, irrespective of the source index codec. If the target index is created dynamically through a rollup job, the default codec is used for segments of the target index. + +## Changing an index codec + +It is not possible to change the codec setting of an open index. You can close the index, apply the new index codec setting, and reopen the index, at which point only new segments will be written with the new codec. This requires stopping all reads and writes to the index for a brief period to make the codec change and may result in inconsistent segment sizes and compression ratios. Alternatively, you can reindex all data from a source index into a new index with a different codec setting, though this is a very resource-intensive operation. + +## Performance tuning and benchmarking + +Depending on your specific use case, you might need to experiment with different index codec settings to fine-tune the performance of your OpenSearch cluster. Conducting benchmark tests with different codecs and measuring the impact on indexing speed, search performance, and resource utilization can help you identify the optimal index codec setting for your workload. With the `zstd` and `zstd_no_dict` codecs, you can also fine-tune the compression level in order to identify the optimal configuration for your cluster. + +### Benchmarking + +The following table provides a performance comparison of the `best_compression`, `zstd`, and `zstd_no_dict` codecs against the `default` codec. The tests were performed with the [`nyc_taxi`](https://github.com/topics/nyc-taxi-dataset) dataset. The results are listed in terms of percent change, and bold results indicate performance improvement. + +| | `best_compression` | `zstd` | `zstd_no_dict` | +|:--- |:--- |:--- |:--- | +|**Write** | | | +|Median Latency |0% |0% |−1% | +|p90 Latency |3% |2% |**−5%** | +|Throughput |−2% |**7%** |**14%** | +|**Read** | | | +|Median Latency |0% |1% |0% | +|p90 Latency |1% |1% |**−2%** | +|**Disk** | | | +| Compression ratio |**−34%** |**−35%** |**−30%** | + diff --git a/_im-plugin/index-rollups/index.md b/_im-plugin/index-rollups/index.md index 4637a95e..59cd304d 100644 --- a/_im-plugin/index-rollups/index.md +++ b/_im-plugin/index-rollups/index.md @@ -3,13 +3,13 @@ layout: default title: Index rollups nav_order: 35 has_children: true -redirect_from: /im-plugin/index-rollups/ -has_toc: false +redirect_from: + - /im-plugin/index-rollups/ --- # Index rollups -Time series data increases storage costs, strains cluster health, and slows down aggregations over time. Index rollup lets you periodically reduce data granularity by rolling up old data into summarized indices. +Time series data increases storage costs, strains cluster health, and slows down aggregations over time. Index rollup lets you periodically reduce data granularity by rolling up old data into summarized indexes. You pick the fields that interest you and use index rollup to create a new index with only those fields aggregated into coarser time buckets. You can store months or years of historical data at a fraction of the cost with the same query performance. 
@@ -18,7 +18,7 @@ For example, say you collect CPU consumption data every five seconds and store i You can use index rollup in three ways: 1. Use the index rollup API for an on-demand index rollup job that operates on an index that's not being actively ingested such as a rolled-over index. For example, you can perform an index rollup operation to reduce data collected at a five minute interval to a weekly average for trend analysis. -2. Use the OpenSearch Dashboards UI to create an index rollup job that runs on a defined schedule. You can also set it up to roll up your indices as it’s being actively ingested. For example, you can continuously roll up Logstash indices from a five second interval to a one hour interval. +2. Use the OpenSearch Dashboards UI to create an index rollup job that runs on a defined schedule. You can also set it up to roll up your indexes as it’s being actively ingested. For example, you can continuously roll up Logstash indexes from a five second interval to a one hour interval. 3. Specify the index rollup job as an ISM action for complete index management. This allows you to roll up an index after a certain event such as a rollover, index age reaching a certain point, index becoming read-only, and so on. You can also have rollover and index rollup jobs running in sequence, where the rollover first moves the current index to a warm node and then the index rollup job creates a new index with the minimized data on the hot node. ## Create an Index Rollup Job @@ -26,7 +26,7 @@ You can use index rollup in three ways: To get started, choose **Index Management** in OpenSearch Dashboards. Select **Rollup Jobs** and choose **Create rollup job**. -### Step 1: Set up indices +### Step 1: Set up indexes 1. In the **Job name and description** section, specify a unique name and an optional description for the index rollup job. 2. In the **Indices** section, select the source and target index. The source index is the one that you want to roll up. The source index remains as is, the index rollup job creates a new index referred to as a target index. The target index is where the index rollup results are saved. For target index, you can either type in a name for a new index or you select an existing index. @@ -48,7 +48,7 @@ The order in which you select attributes is critical. A city followed by a demog ### Step 3: Specify schedule -Specify a schedule to roll up your indices as it’s being ingested. The index rollup job is enabled by default. +Specify a schedule to roll up your indexes as it’s being ingested. The index rollup job is enabled by default. 1. Specify if the data is continuous or not. 3. For roll up execution frequency, select **Define by fixed interval** and specify the **Rollup interval** and the time unit or **Define by cron expression** and add in a cron expression to select the interval. To learn how to define a cron expression, see [Alerting]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/cron/). @@ -97,7 +97,7 @@ Then run a search: GET opensearch_dashboards_sample_data_ecommerce/_search ``` -#### Sample response +#### Example response ```json { @@ -303,7 +303,7 @@ PUT _plugins/_rollup/jobs/example ``` You can query the `example_rollup` index for the terms aggregations on the fields set up in the rollup job. -You get back the same response that you would on the original `opensearch_dashboards_sample_data_ecommerce` source index. 
+You get back the same response that you would on the original `opensearch_dashboards_sample_data_ecommerce` source index: ```json POST example_rollup/_search @@ -343,114 +343,494 @@ POST example_rollup/_search } ``` -#### Sample Response +#### Example response ```json { - "took": 476, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 + "took" : 14, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 }, - "hits": { - "total": { - "value": 281, - "relation": "eq" + "hits" : { + "total" : { + "value" : 281, + "relation" : "eq" }, - "max_score": null, - "hits": [] + "max_score" : null, + "hits" : [ ] }, - "aggregations": { - "daily_numbers": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ + "aggregations" : { + "daily_numbers" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Friday", - "doc_count": 53, - "total_revenue": { - "value": 4858.84375 + "key" : "Friday", + "doc_count" : 59, + "total_revenue" : { + "value" : 4858.84375 }, - "per_city": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Los Angeles", - "doc_count": 53, - "average quantity": { - "value": 2.305084745762712 + "key" : "Los Angeles", + "doc_count" : 59, + "average quantity" : { + "value" : 2.305084745762712 } } ] } }, { - "key": "Saturday", - "doc_count": 43, - "total_revenue": { - "value": 3547.203125 + "key" : "Saturday", + "doc_count" : 46, + "total_revenue" : { + "value" : 3547.203125 }, - "per_city": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Los Angeles", - "doc_count": 43, - "average quantity": { - "value": 2.260869565217391 + "key" : "Los Angeles", + "doc_count" : 46, + "average quantity" : { + "value" : 2.260869565217391 } } ] } }, { - "key": "Tuesday", - "doc_count": 42, - "total_revenue": { - "value": 3983.28125 + "key" : "Tuesday", + "doc_count" : 45, + "total_revenue" : { + "value" : 3983.28125 }, - "per_city": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Los Angeles", - "doc_count": 42, - "average quantity": { - "value": 2.2888888888888888 + "key" : "Los Angeles", + "doc_count" : 45, + "average quantity" : { + "value" : 2.2888888888888888 } } ] } }, { - "key": "Sunday", - "doc_count": 40, - "total_revenue": { - "value": 3308.1640625 + "key" : "Sunday", + "doc_count" : 44, + "total_revenue" : { + "value" : 3308.1640625 }, - "per_city": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Los Angeles", - "doc_count": 40, - "average quantity": { - "value": 2.090909090909091 + "key" : "Los Angeles", + "doc_count" : 44, + "average quantity" : { + "value" : 2.090909090909091 + } + } + ] + } + }, + { + "key" : "Thursday", + "doc_count" : 40, + "total_revenue" : { + "value" : 2876.125 + }, + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Los Angeles", + "doc_count" : 40, + "average quantity" : { + "value" 
: 2.3 + } + } + ] + } + }, + { + "key" : "Monday", + "doc_count" : 38, + "total_revenue" : { + "value" : 2673.453125 + }, + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Los Angeles", + "doc_count" : 38, + "average quantity" : { + "value" : 2.1578947368421053 + } + } + ] + } + }, + { + "key" : "Wednesday", + "doc_count" : 38, + "total_revenue" : { + "value" : 3202.453125 + }, + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Los Angeles", + "doc_count" : 38, + "average quantity" : { + "value" : 2.236842105263158 } } ] } } - ... ] } } } ``` + +## The doc_count field + +The `doc_count` field in bucket aggregations contains the number of documents collected in each bucket. When calculating the bucket's `doc_count`, the number of documents is incremented by the number of the pre-aggregated documents in each summary document. The `doc_count` returned from rollup searches represents the total number of matching documents from the source index. The document count for each bucket is the same whether you search the source index or the rollup target index. + +## Query string queries + +To take advantage of shorter and more easily written strings in Query DSL, you can use [query strings]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text/query-string/) to simplify search queries in rollup indexes. To use query strings, add the following fields to your rollup search request: + +```json +"query": { + "query_string": { + "query": "field_name:field_value" + } + } +``` + +The following example uses a query string with a `*` wildcard operator to search inside a rollup index called `my_server_logs_rollup`: + +```json +GET my_server_logs_rollup/_search +{ + "size": 0, + "query": { + "query_string": { + "query": "email* OR inventory", + "default_field": "service_name" + } + }, + + "aggs": { + "service_name": { + "terms": { + "field": "service_name" + }, + "aggs": { + "region": { + "terms": { + "field": "region" + }, + "aggs": { + "average quantity": { + "avg": { + "field": "cpu_usage" + } + } + } + } + } + } + } +} +``` + +For more information about query string query parameters, see [Query string query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text/query-string/#parameters). + +## Dynamic target index + + + +In ISM rollup, the `target_index` field may contain a template that is compiled at the time of each rollup indexing. For example, if you specify the `target_index` field as `{% raw %}rollup_ndx-{{ctx.source_index}}{% endraw %}`, the source index `log-000001` will roll up into a target index `rollup_ndx-log-000001`. This allows you to roll up data into multiple time-based indexes, with one rollup job created for each source index. + +The `source_index` parameter in {% raw %}`{{ctx.source_index}}`{% endraw %} cannot contain wildcards. +{: .note} + +## Searching multiple rollup indexes + +When data is rolled up into multiple target indexes, you can run one search across all of the rollup indexes. To search multiple target indexes that have the same rollup, specify the index names as a comma-separated list or a wildcard pattern. For example, with `target_index` as `{% raw %}rollup_ndx-{{ctx.source_index}}{% endraw %}` and source indexes that start with `log`, specify the `rollup_ndx-log*` pattern. Or, to search for rolled up log-000001 and log-000002 indexes, specify the `rollup_ndx-log-000001,rollup_ndx-log-000002` list. 
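
For example, the following minimal sketch runs a terms aggregation across every rollup index matching the `rollup_ndx-log*` pattern; the `message.keyword` field is a placeholder for a dimension that the rollup jobs actually contain:

```json
GET rollup_ndx-log*/_search
{
  "size": 0,
  "aggs": {
    "messages": {
      "terms": {
        "field": "message.keyword"
      }
    }
  }
}
```

To search specific target indexes instead, replace the wildcard pattern with a comma-separated list such as `rollup_ndx-log-000001,rollup_ndx-log-000002`.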
+ +You cannot search a mix of rollup and non-rollup indexes with the same query. +{: .note} + +## Example + +The following example demonstrates the `doc_count` field, dynamic index names, and searching multiple rollup indexes with the same rollup. + +**Step 1:** Add an index template for ISM to manage the rolling over of the indexes aliased by `log`: + +```json +PUT _index_template/ism_rollover +{ + "index_patterns": ["log*"], + "template": { + "settings": { + "plugins.index_state_management.rollover_alias": "log" + } + } +} +``` + +**Step 2:** Set up an ISM rollover policy to roll over any index whose name starts with `log*` after one document is uploaded to it, and then roll up the individual backing index. The target index name is dynamically generated from the source index name by prepending the string `rollup_ndx-` to the source index name. + +```json +PUT _plugins/_ism/policies/rollover_policy +{ + "policy": { + "description": "Example rollover policy.", + "default_state": "rollover", + "states": [ + { + "name": "rollover", + "actions": [ + { + "rollover": { + "min_doc_count": 1 + } + } + ], + "transitions": [ + { + "state_name": "rp" + } + ] + }, + { + "name": "rp", + "actions": [ + { + "rollup": { + "ism_rollup": { + "target_index": {% raw %}"rollup_ndx-{{ctx.source_index}}"{% endraw %}, + "description": "Example rollup job", + "page_size": 200, + "dimensions": [ + { + "date_histogram": { + "source_field": "ts", + "fixed_interval": "60m", + "timezone": "America/Los_Angeles" + } + }, + { + "terms": { + "source_field": "message.keyword" + } + } + ], + "metrics": [ + { + "source_field": "msg_size", + "metrics": [ + { + "sum": {} + } + ] + } + ] + } + } + } + ], + "transitions": [] + } + ], + "ism_template": { + "index_patterns": ["log*"], + "priority": 100 + } + } +} +``` + +**Step 3:** Create an index named `log-000001` and set up an alias `log` for it. + +```json +PUT log-000001 +{ + "aliases": { + "log": { + "is_write_index": true + } + } +} +``` + +**Step 4:** Index four documents into the index created above. Two of the documents have the message "Success", and two have the message "Error". + +```json +POST log/_doc?refresh=true +{ + "ts" : "2022-08-26T09:28:48-04:00", + "message": "Success", + "msg_size": 10 +} +``` + +```json +POST log/_doc?refresh=true +{ + "ts" : "2022-08-26T10:06:25-04:00", + "message": "Error", + "msg_size": 20 +} +``` + +```json +POST log/_doc?refresh=true +{ + "ts" : "2022-08-26T10:23:54-04:00", + "message": "Error", + "msg_size": 30 +} +``` + +```json +POST log/_doc?refresh=true +{ + "ts" : "2022-08-26T10:53:41-04:00", + "message": "Success", + "msg_size": 40 +} +``` + +Once you index the first document, the rollover action is executed. This action creates the index `log-000002` with `rollover_policy` attached to it. Then the rollup action is executed, which creates the rollup index `rollup_ndx-log-000001`. + +To monitor the status of rollover and rollup index creation, you can use the ISM explain API: `GET _plugins/_ism/explain` +{: .tip} + +**Step 5:** Search the rollup index. 
+ +```json +GET rollup_ndx-log-*/_search +{ + "size": 0, + "query": { + "match_all": {} + }, + "aggregations": { + "message_numbers": { + "terms": { + "field": "message.keyword" + }, + "aggs": { + "per_message": { + "terms": { + "field": "message.keyword" + }, + "aggregations": { + "sum_message": { + "sum": { + "field": "msg_size" + } + } + } + } + } + } + } +} +``` + +The response contains two buckets, "Error" and "Success", and the document count for each bucket is 2: + +```json +{ + "took" : 30, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 4, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "message_numbers" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Success", + "doc_count" : 2, + "per_message" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Success", + "doc_count" : 2, + "sum_message" : { + "value" : 50.0 + } + } + ] + } + }, + { + "key" : "Error", + "doc_count" : 2, + "per_message" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Error", + "doc_count" : 2, + "sum_message" : { + "value" : 50.0 + } + } + ] + } + } + ] + } + } +} +``` + +## Index codec considerations + +For index codec considerations, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#index-rollups-and-transforms). \ No newline at end of file diff --git a/_im-plugin/index-rollups/rollup-api.md b/_im-plugin/index-rollups/rollup-api.md index 7aa878d3..61bfdf76 100644 --- a/_im-plugin/index-rollups/rollup-api.md +++ b/_im-plugin/index-rollups/rollup-api.md @@ -50,38 +50,42 @@ PUT _plugins/_rollup/jobs/?if_seq_no=1&if_primary_term=1 // Update "example_rollup_index_all" ], "continuous": false, - "dimensions": { - "date_histogram": { - "source_field": "tpep_pickup_datetime", - "fixed_interval": "1h", - "timezone": "America/Los_Angeles" - }, - "terms": { - "source_field": "PULocationID" - }, - "metrics": [ - { - "source_field": "passenger_count", - "metrics": [ - { - "avg": {} - }, - { - "sum": {} - }, - { - "max": {} - }, - { - "min": {} - }, - { - "value_count": {} - } - ] + "dimensions": [ + { + "date_histogram": { + "source_field": "tpep_pickup_datetime", + "fixed_interval": "1h", + "timezone": "America/Los_Angeles" } - ] - } + }, + { + "terms": { + "source_field": "PULocationID" + } + } + ], + "metrics": [ + { + "source_field": "passenger_count", + "metrics": [ + { + "avg": {} + }, + { + "sum": {} + }, + { + "max": {} + }, + { + "min": {} + }, + { + "value_count": {} + } + ] + } + ] } } ``` @@ -91,7 +95,7 @@ You can specify the following options. Options | Description | Type | Required :--- | :--- |:--- |:--- | `source_index` | The name of the detector. | String | Yes -`target_index` | Specify the target index that the rolled up data is ingested into. You could either create a new target index or use an existing index. The target index cannot be a combination of raw and rolled up data. | String | Yes +`target_index` | Specify the target index that the rolled up data is ingested into. You can either create a new target index or use an existing index. The target index cannot be a combination of raw and rolled up data. This field supports dynamically generated index names like {% raw %}`rollup_{{ctx.source_index}}`{% endraw %}, where `source_index` cannot contain wildcards. 
| String | Yes `schedule` | Schedule of the index rollup job which can be an interval or a cron expression. | Object | Yes `schedule.interval` | Specify the frequency of execution of the rollup job. | Object | No `schedule.interval.start_time` | Start time of the interval. | Timestamp | Yes @@ -103,33 +107,81 @@ Options | Description | Type | Required `description` | Optionally, describe the rollup job. | String | No `enabled` | When true, the index rollup job is scheduled. Default is true. | Boolean | Yes `continuous` | Specify whether or not the index rollup job continuously rolls up data forever or just executes over the current data set once and stops. Default is false. | Boolean | Yes -`error_notification` | Set up a Mustache message template sent for error notifications. For example, if an index rollup job fails, the system sends a message to a Slack channel. | Object | No -`page_size` | Specify the number of buckets to paginate through at a time while rolling up. | Number | Yes +`error_notification` | Set up a Mustache message template for error notifications. For example, if an index rollup job fails, the system sends a message to a Slack channel. | Object | No +`page_size` | Specify the number of buckets to paginate at a time during rollup. | Number | Yes `delay` | The number of milliseconds to delay execution of the index rollup job. | Long | No -`dimensions` | Specify aggregations to create dimensions for the roll up time window. | Object | Yes -`dimensions.date_histogram` | Specify either fixed_interval or calendar_interval, but not both. Either one limits what you can query in the target index. | Object | No -`dimensions.date_histogram.fixed_interval` | Specify the fixed interval for aggregations in milliseconds, seconds, minutes, hours, or days. | String | No -`dimensions.date_histogram.calendar_interval` | Specify the calendar interval for aggregations in minutes, hours, days, weeks, months, quarters, or years. | String | No -`dimensions.date_histogram.field` | Specify the date field used in date histogram aggregation. | String | No -`dimensions.date_histogram.timezone` | Specify the timezones as defined by the IANA Time Zone Database. The default is UTC. | String | No -`dimensions.terms` | Specify the term aggregations that you want to roll up. | Object | No -`dimensions.terms.fields` | Specify terms aggregation for compatible fields. | Object | No -`dimensions.histogram` | Specify the histogram aggregations that you want to roll up. | Object | No -`dimensions.histogram.field` | Add a field for histogram aggregations. | String | Yes -`dimensions.histogram.interval` | Specify the histogram aggregation interval for the field. | Long | Yes -`dimensions.metrics` | Specify a list of objects that represent the fields and metrics that you want to calculate. | Nested object | No -`dimensions.metrics.field` | Specify the field that you want to perform metric aggregations on. | String | No -`dimensions.metrics.field.metrics` | Specify the metric aggregations you want to calculate for the field. | Multiple strings | No +`dimensions` | Specify aggregations to create dimensions for the roll up time window. Supported groups are `terms`, `histogram`, and `date_histogram`. For more information, see [Bucket Aggregations]({{site.url}}{{site.baseurl}}/opensearch/bucket-agg). | Array | Yes +`metrics` | Specify a list of objects that represent the fields and metrics that you want to calculate. Supported metrics are `sum`, `max`, `min`, `value_count` and `avg`. 
For more information, see [Metric Aggregations]({{site.url}}{{site.baseurl}}/opensearch/metric-agg). | Array | No -#### Sample response +#### Example response ```json { - "_id": "rollup_id", - "_seqNo": 1, - "_primaryTerm": 1, - "rollup": { ... } + "_id": "", + "_version": 3, + "_seq_no": 1, + "_primary_term": 1, + "rollup": { + "rollup_id": "", + "enabled": true, + "schedule": { + "interval": { + "start_time": 1680159934649, + "period": 1, + "unit": "Days", + "schedule_delay": 0 + } + }, + "last_updated_time": 1680159934649, + "enabled_time": 1680159934649, + "description": "Example rollup job", + "schema_version": 17, + "source_index": "nyc-taxi-data", + "target_index": "rollup-nyc-taxi-data", + "metadata_id": null, + "page_size": 200, + "delay": 0, + "continuous": false, + "dimensions": [ + { + "date_histogram": { + "fixed_interval": "1h", + "source_field": "tpep_pickup_datetime", + "target_field": "tpep_pickup_datetime", + "timezone": "America/Los_Angeles" + } + }, + { + "terms": { + "source_field": "PULocationID", + "target_field": "PULocationID" + } + } + ], + "metrics": [ + { + "source_field": "passenger_count", + "metrics": [ + { + "avg": {} + }, + { + "sum": {} + }, + { + "max": {} + }, + { + "min": {} + }, + { + "value_count": {} + } + ] + } + ] + } } ``` @@ -147,7 +199,7 @@ GET _plugins/_rollup/jobs/ ``` -#### Sample response +#### Example response ```json { @@ -173,7 +225,7 @@ Deletes an index rollup job based on the `rollup_id`. DELETE _plugins/_rollup/jobs/ ``` -#### Sample response +#### Example response ```json 200 OK @@ -196,7 +248,7 @@ POST _plugins/_rollup/jobs//_stop ``` -#### Sample response +#### Example response ```json 200 OK @@ -218,7 +270,7 @@ GET _plugins/_rollup/jobs//_explain ``` -#### Sample response +#### Example response ```json { diff --git a/_im-plugin/index-rollups/settings.md b/_im-plugin/index-rollups/settings.md new file mode 100644 index 00000000..3ee1be33 --- /dev/null +++ b/_im-plugin/index-rollups/settings.md @@ -0,0 +1,22 @@ +--- +layout: default +title: Settings +parent: Index rollups +nav_order: 30 +--- + +# Index rollup settings + +We don't recommend changing these settings; the defaults should work well for most use cases. + +All settings are available using the OpenSearch `_cluster/settings` operation. None require a restart, and all can be marked `persistent` or `transient`. To learn more about static and dynamic settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). + +Setting | Default | Description +:--- | :--- | :--- +`plugins.rollup.search.backoff_millis` | 1000 milliseconds | The backoff time between retries for failed rollup jobs. +`plugins.rollup.search.backoff_count` | 5 | How many retries the plugin should attempt for failed rollup jobs. +`plugins.rollup.search.search_all_jobs` | false | Whether OpenSearch should return all jobs that match all specified search terms. If disabled, OpenSearch returns just one, as opposed to all, of the jobs that matches the search terms. +`plugins.rollup.dashboards.enabled` | true | Whether rollups are enabled in OpenSearch Dashboards. +`plugins.rollup.enabled` | true | Whether the rollup plugin is enabled. +`plugins.ingest.backoff_millis` | 1000 milliseconds | The backoff time between data ingestions for rollup jobs. +`plugins.ingest.backoff_count` | 5 | How many retries the plugin should attempt for failed ingestions. 
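
If you do need to change one of these values, use the cluster settings API. The following sketch raises the retry count for failed rollup searches; the value `10` is only an example:

```json
PUT _cluster/settings
{
  "persistent": {
    "plugins.rollup.search.backoff_count": 10
  }
}
```

Specify `transient` instead of `persistent` if the change should not survive a full cluster restart.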
diff --git a/_opensearch/index-templates.md b/_im-plugin/index-templates.md similarity index 80% rename from _opensearch/index-templates.md rename to _im-plugin/index-templates.md index 3e9f18a0..015c9908 100644 --- a/_opensearch/index-templates.md +++ b/_im-plugin/index-templates.md @@ -1,22 +1,25 @@ --- layout: default title: Index templates -nav_order: 15 +nav_order: 6 +redirect_from: + - /opensearch/index-templates/ --- # Index templates -Index templates let you initialize new indices with predefined mappings and settings. For example, if you continuously index log data, you can define an index template so that all of these indices have the same number of shards and replicas. +Index templates let you initialize new indexes with predefined mappings and settings. For example, if you continuously index log data, you can define an index template so that all of these indexes have the same number of shards and replicas. ### Create a template -To create an index template, use a POST request: +To create an index template, use a PUT or POST request: ```json -POST _index_template +PUT _index_template/